[llvm] [CodeGen] Regen some old tests; NFC (PR #91250)
via llvm-commits
llvm-commits at lists.llvm.org
Mon May 6 11:16:51 PDT 2024
https://github.com/goldsteinn created https://github.com/llvm/llvm-project/pull/91250
From 39d99d94ff528cc4b9a391913e0110f039d3d8f8 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Mon, 6 May 2024 11:41:18 -0500
Subject: [PATCH] [CodeGen] Regen some old tests; NFC
---
.../AArch64/aarch64-address-type-promotion.ll | 14 +-
.../CodeGen/AArch64/arm64-narrow-st-merge.ll | 125 +
llvm/test/CodeGen/AArch64/bswap-known-bits.ll | 8 +-
...ist-and-by-const-from-shl-in-eqcmp-zero.ll | 10 +-
.../AArch64/pull-binop-through-shift.ll | 8 +-
llvm/test/CodeGen/AArch64/shift-mod.ll | 4 +-
...vector_splat-const-shift-of-constmasked.ll | 90 +-
llvm/test/CodeGen/AMDGPU/build_vector.ll | 373 +
llvm/test/CodeGen/AMDGPU/fneg.ll | 657 ++
.../AMDGPU/kernel-argument-dag-lowering.ll | 332 +
.../CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll | 57 +
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll | 3 +
.../AMDGPU/llvm.amdgcn.struct.buffer.load.ll | 156 +
.../llvm.amdgcn.struct.ptr.buffer.load.ll | 156 +
.../AMDGPU/llvm.r600.read.local.size.ll | 343 +
llvm/test/CodeGen/AMDGPU/scratch-simple.ll | 7168 +++++++++++++++++
llvm/test/CodeGen/AMDGPU/sext-in-reg.ll | 2253 ++++++
llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll | 427 +
.../test/CodeGen/AMDGPU/shl-add-to-add-shl.ll | 31 +
llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll | 51 +
llvm/test/CodeGen/AMDGPU/store-private.ll | 1733 ++++
llvm/test/CodeGen/ARM/Windows/alloca.ll | 3 +
llvm/test/CodeGen/ARM/Windows/vla.ll | 4 +
llvm/test/CodeGen/ARM/and-cmpz.ll | 119 +
llvm/test/CodeGen/ARM/bfx.ll | 49 +-
llvm/test/CodeGen/ARM/sbfx.ll | 41 +-
llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll | 84 +-
llvm/test/CodeGen/ARM/shift-combine.ll | 127 +
llvm/test/CodeGen/BPF/remove_truncate_9.ll | 3 +
llvm/test/CodeGen/Mips/cins.ll | 49 +-
llvm/test/CodeGen/Mips/fabs.ll | 17 +-
llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll | 107 +-
llvm/test/CodeGen/Mips/fcopysign.ll | 123 +-
llvm/test/CodeGen/Mips/llvm-ir/abs.ll | 125 +-
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 219 +
llvm/test/CodeGen/NVPTX/mulwide.ll | 24 +-
.../NVPTX/unaligned-param-load-store.ll | 3 +
llvm/test/CodeGen/PowerPC/coalesce-ext.ll | 13 +-
llvm/test/CodeGen/PowerPC/extsh.ll | 1 +
llvm/test/CodeGen/PowerPC/shl_sext.ll | 1 +
llvm/test/CodeGen/SystemZ/int-abs-01.ll | 61 +-
llvm/test/CodeGen/SystemZ/int-cmp-44.ll | 466 +-
llvm/test/CodeGen/SystemZ/int-mul-10.ll | 41 +-
llvm/test/CodeGen/SystemZ/int-neg-02.ll | 86 +-
llvm/test/CodeGen/Thumb2/bfx.ll | 19 +-
llvm/test/CodeGen/VE/Scalar/bitreverse.ll | 1 +
llvm/test/CodeGen/WebAssembly/conv.ll | 3 +
.../CodeGen/WebAssembly/simd-sext-inreg.ll | 5 +
llvm/test/CodeGen/X86/lvi-hardening-loads.ll | 196 +-
llvm/test/CodeGen/X86/sext-subreg.ll | 13 +-
llvm/test/CodeGen/X86/x86-64-extend-shift.ll | 7 +-
51 files changed, 15408 insertions(+), 601 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-address-type-promotion.ll b/llvm/test/CodeGen/AArch64/aarch64-address-type-promotion.ll
index d8280dadc550ea..e14618251b6d7d 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-address-type-promotion.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-address-type-promotion.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -o - | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
@@ -5,13 +6,14 @@ target triple = "arm64-apple-macosx10.9"
; Check that sexts get promoted above adds.
define void @foo(ptr nocapture %a, i32 %i) {
+; CHECK-LABEL: foo:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-NEXT: ldp w9, w10, [x8, #4]
+; CHECK-NEXT: add w9, w10, w9
+; CHECK-NEXT: str w9, [x8]
+; CHECK-NEXT: ret
entry:
-; CHECK-LABEL: _foo:
-; CHECK: add
-; CHECK-NEXT: ldp
-; CHECK-NEXT: add
-; CHECK-NEXT: str
-; CHECK-NEXT: ret
%add = add nsw i32 %i, 1
%idxprom = sext i32 %add to i64
%arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
diff --git a/llvm/test/CodeGen/AArch64/arm64-narrow-st-merge.ll b/llvm/test/CodeGen/AArch64/arm64-narrow-st-merge.ll
index 81c3195584701c..01ad14b6fba52a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-narrow-st-merge.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-narrow-st-merge.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple aarch64 -verify-machineinstrs | FileCheck %s
; RUN: llc < %s -mtriple aarch64 -mattr=+strict-align -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-STRICT
@@ -7,6 +8,19 @@
; CHECK-STRICT: strh wzr
; CHECK-STRICT: strh wzr
define void @Strh_zero(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Strh_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sbfiz x8, x1, #1, #32
+; CHECK-NEXT: str wzr, [x0, x8]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Strh_zero:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #1
+; CHECK-STRICT-NEXT: strh wzr, [x8]
+; CHECK-STRICT-NEXT: strh wzr, [x8, #2]
+; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
%arrayidx = getelementptr inbounds i16, ptr %P, i64 %idxprom
@@ -26,6 +40,21 @@ entry:
; CHECK-STRICT: strh wzr
; CHECK-STRICT: strh wzr
define void @Strh_zero_4(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Strh_zero_4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sbfiz x8, x1, #1, #32
+; CHECK-NEXT: str xzr, [x0, x8]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Strh_zero_4:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #1
+; CHECK-STRICT-NEXT: strh wzr, [x8]
+; CHECK-STRICT-NEXT: strh wzr, [x8, #2]
+; CHECK-STRICT-NEXT: strh wzr, [x8, #4]
+; CHECK-STRICT-NEXT: strh wzr, [x8, #6]
+; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
%arrayidx = getelementptr inbounds i16, ptr %P, i64 %idxprom
@@ -50,6 +79,18 @@ entry:
; CHECK-STRICT-LABEL: Strw_zero
; CHECK-STRICT: stp wzr, wzr
define void @Strw_zero(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Strw_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT: sbfiz x8, x1, #2, #32
+; CHECK-NEXT: str xzr, [x0, x8]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Strw_zero:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-STRICT-NEXT: stp wzr, wzr, [x8]
+; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
%arrayidx = getelementptr inbounds i32, ptr %P, i64 %idxprom
@@ -64,6 +105,17 @@ entry:
; CHECK-LABEL: Strw_zero_nonzero
; CHECK: stp wzr, w1
define void @Strw_zero_nonzero(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Strw_zero_nonzero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-NEXT: stp wzr, w1, [x8]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Strw_zero_nonzero:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-STRICT-NEXT: stp wzr, w1, [x8]
+; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
%arrayidx = getelementptr inbounds i32, ptr %P, i64 %idxprom
@@ -81,6 +133,18 @@ entry:
; CHECK-STRICT: stp wzr, wzr
; CHECK-STRICT: stp wzr, wzr
define void @Strw_zero_4(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Strw_zero_4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-NEXT: stp xzr, xzr, [x8]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Strw_zero_4:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-STRICT-NEXT: stp wzr, wzr, [x8]
+; CHECK-STRICT-NEXT: stp wzr, wzr, [x8, #8]
+; CHECK-STRICT-NEXT: ret
entry:
%idxprom = sext i32 %n to i64
%arrayidx = getelementptr inbounds i32, ptr %P, i64 %idxprom
@@ -106,6 +170,18 @@ entry:
; CHECK-STRICT: sturb wzr
; CHECK-STRICT: sturb wzr
define void @Sturb_zero(ptr nocapture %P, i32 %n) #0 {
+; CHECK-LABEL: Sturb_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw
+; CHECK-NEXT: sturh wzr, [x8, #-2]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Sturb_zero:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw
+; CHECK-STRICT-NEXT: sturb wzr, [x8, #-2]
+; CHECK-STRICT-NEXT: sturb wzr, [x8, #-1]
+; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -2
%idxprom = sext i32 %sub to i64
@@ -124,6 +200,18 @@ entry:
; CHECK-STRICT: sturh wzr
; CHECK-STRICT: sturh wzr
define void @Sturh_zero(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Sturh_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #1
+; CHECK-NEXT: stur wzr, [x8, #-6]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Sturh_zero:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #1
+; CHECK-STRICT-NEXT: sturh wzr, [x8, #-4]
+; CHECK-STRICT-NEXT: sturh wzr, [x8, #-6]
+; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -2
%idxprom = sext i32 %sub to i64
@@ -144,6 +232,20 @@ entry:
; CHECK-STRICT: sturh wzr
; CHECK-STRICT: sturh wzr
define void @Sturh_zero_4(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Sturh_zero_4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #1
+; CHECK-NEXT: stur xzr, [x8, #-8]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Sturh_zero_4:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #1
+; CHECK-STRICT-NEXT: sturh wzr, [x8, #-6]
+; CHECK-STRICT-NEXT: sturh wzr, [x8, #-8]
+; CHECK-STRICT-NEXT: sturh wzr, [x8, #-4]
+; CHECK-STRICT-NEXT: sturh wzr, [x8, #-2]
+; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -3
%idxprom = sext i32 %sub to i64
@@ -169,6 +271,17 @@ entry:
; CHECK-STRICT-LABEL: Sturw_zero
; CHECK-STRICT: stp wzr, wzr
define void @Sturw_zero(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Sturw_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-NEXT: stur xzr, [x8, #-16]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Sturw_zero:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-STRICT-NEXT: stp wzr, wzr, [x8, #-16]
+; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -3
%idxprom = sext i32 %sub to i64
@@ -187,6 +300,18 @@ entry:
; CHECK-STRICT: stp wzr, wzr
; CHECK-STRICT: stp wzr, wzr
define void @Sturw_zero_4(ptr nocapture %P, i32 %n) {
+; CHECK-LABEL: Sturw_zero_4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-NEXT: stp xzr, xzr, [x8, #-16]
+; CHECK-NEXT: ret
+;
+; CHECK-STRICT-LABEL: Sturw_zero_4:
+; CHECK-STRICT: // %bb.0: // %entry
+; CHECK-STRICT-NEXT: add x8, x0, w1, sxtw #2
+; CHECK-STRICT-NEXT: stp wzr, wzr, [x8, #-16]
+; CHECK-STRICT-NEXT: stp wzr, wzr, [x8, #-8]
+; CHECK-STRICT-NEXT: ret
entry:
%sub = add nsw i32 %n, -3
%idxprom = sext i32 %sub to i64
diff --git a/llvm/test/CodeGen/AArch64/bswap-known-bits.ll b/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
index 23619e47367d01..f13ef52f94a414 100644
--- a/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
+++ b/llvm/test/CodeGen/AArch64/bswap-known-bits.ll
@@ -8,7 +8,7 @@ declare i64 @llvm.bswap.i64(i64)
define i1 @test1(i16 %arg) {
; CHECK-LABEL: test1:
; CHECK: ; %bb.0:
-; CHECK-NEXT: mov w0, #1
+; CHECK-NEXT: mov w0, #1 ; =0x1
; CHECK-NEXT: ret
%a = or i16 %arg, 511
%b = call i16 @llvm.bswap.i16(i16 %a)
@@ -20,7 +20,7 @@ define i1 @test1(i16 %arg) {
define i1 @test2(i16 %arg) {
; CHECK-LABEL: test2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: mov w0, #1
+; CHECK-NEXT: mov w0, #1 ; =0x1
; CHECK-NEXT: ret
%a = or i16 %arg, 1
%b = call i16 @llvm.bswap.i16(i16 %a)
@@ -32,7 +32,7 @@ define i1 @test2(i16 %arg) {
define i1 @test3(i16 %arg) {
; CHECK-LABEL: test3:
; CHECK: ; %bb.0:
-; CHECK-NEXT: mov w0, #1
+; CHECK-NEXT: mov w0, #1 ; =0x1
; CHECK-NEXT: ret
%a = or i16 %arg, 256
%b = call i16 @llvm.bswap.i16(i16 %a)
@@ -44,7 +44,7 @@ define i1 @test3(i16 %arg) {
define i1 @test4(i32 %arg) {
; CHECK-LABEL: test4:
; CHECK: ; %bb.0:
-; CHECK-NEXT: mov w0, #1
+; CHECK-NEXT: mov w0, #1 ; =0x1
; CHECK-NEXT: ret
%a = or i32 %arg, 2147483647 ; i32_MAX
%b = call i32 @llvm.bswap.i32(i32 %a)
diff --git a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index 32a62453202f40..60ceaf19731921 100644
--- a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -289,7 +289,7 @@ define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
define i1 @scalar_i32_x_is_const_eq(i32 %y) nounwind {
; CHECK-LABEL: scalar_i32_x_is_const_eq:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #43605
+; CHECK-NEXT: mov w8, #43605 // =0xaa55
; CHECK-NEXT: movk w8, #43605, lsl #16
; CHECK-NEXT: lsl w8, w8, w0
; CHECK-NEXT: tst w8, #0x1
@@ -303,8 +303,8 @@ define i1 @scalar_i32_x_is_const_eq(i32 %y) nounwind {
define i1 @scalar_i32_x_is_const2_eq(i32 %y) nounwind {
; CHECK-LABEL: scalar_i32_x_is_const2_eq:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #1
-; CHECK-NEXT: mov w9, #43605
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: mov w9, #43605 // =0xaa55
; CHECK-NEXT: lsl w8, w8, w0
; CHECK-NEXT: movk w9, #43605, lsl #16
; CHECK-NEXT: tst w8, w9
@@ -319,7 +319,7 @@ define i1 @scalar_i32_x_is_const2_eq(i32 %y) nounwind {
define i1 @scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind {
; CHECK-LABEL: scalar_i8_bitsinmiddle_slt:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #24
+; CHECK-NEXT: mov w8, #24 // =0x18
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT: lsl w8, w8, w1
; CHECK-NEXT: and w8, w8, w0
@@ -334,7 +334,7 @@ define i1 @scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind {
define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind {
; CHECK-LABEL: scalar_i8_signbit_eq_with_nonzero:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-128
+; CHECK-NEXT: mov w8, #-128 // =0xffffff80
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT: lsl w8, w8, w1
; CHECK-NEXT: and w8, w8, w0
diff --git a/llvm/test/CodeGen/AArch64/pull-binop-through-shift.ll b/llvm/test/CodeGen/AArch64/pull-binop-through-shift.ll
index b3fbe8bdb6e308..a892bb85692d3e 100644
--- a/llvm/test/CodeGen/AArch64/pull-binop-through-shift.ll
+++ b/llvm/test/CodeGen/AArch64/pull-binop-through-shift.ll
@@ -81,7 +81,7 @@ define i32 @xor_nosignbit_shl(i32 %x, ptr %dst) {
define i32 @add_signbit_shl(i32 %x, ptr %dst) {
; CHECK-LABEL: add_signbit_shl:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-16777216
+; CHECK-NEXT: mov w8, #-16777216 // =0xff000000
; CHECK-NEXT: add w0, w8, w0, lsl #8
; CHECK-NEXT: str w0, [x1]
; CHECK-NEXT: ret
@@ -93,7 +93,7 @@ define i32 @add_signbit_shl(i32 %x, ptr %dst) {
define i32 @add_nosignbit_shl(i32 %x, ptr %dst) {
; CHECK-LABEL: add_nosignbit_shl:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #-16777216
+; CHECK-NEXT: mov w8, #-16777216 // =0xff000000
; CHECK-NEXT: add w0, w8, w0, lsl #8
; CHECK-NEXT: str w0, [x1]
; CHECK-NEXT: ret
@@ -195,7 +195,7 @@ define i32 @add_signbit_lshr(i32 %x, ptr %dst) {
define i32 @add_nosignbit_lshr(i32 %x, ptr %dst) {
; CHECK-LABEL: add_nosignbit_lshr:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2147418112
+; CHECK-NEXT: mov w8, #2147418112 // =0x7fff0000
; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: lsr w0, w8, #8
; CHECK-NEXT: str w0, [x1]
@@ -298,7 +298,7 @@ define i32 @add_signbit_ashr(i32 %x, ptr %dst) {
define i32 @add_nosignbit_ashr(i32 %x, ptr %dst) {
; CHECK-LABEL: add_nosignbit_ashr:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2147418112
+; CHECK-NEXT: mov w8, #2147418112 // =0x7fff0000
; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: asr w0, w8, #8
; CHECK-NEXT: str w0, [x1]
diff --git a/llvm/test/CodeGen/AArch64/shift-mod.ll b/llvm/test/CodeGen/AArch64/shift-mod.ll
index a90603195cf348..ac95b75168ed98 100644
--- a/llvm/test/CodeGen/AArch64/shift-mod.ll
+++ b/llvm/test/CodeGen/AArch64/shift-mod.ll
@@ -127,7 +127,7 @@ define i64 @ashr_add_shl_i36(i64 %r) {
define i64 @ashr_add_shl_mismatch_shifts1(i64 %r) {
; CHECK-LABEL: ashr_add_shl_mismatch_shifts1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #4294967296
+; CHECK-NEXT: mov x8, #4294967296 // =0x100000000
; CHECK-NEXT: add x8, x8, x0, lsl #8
; CHECK-NEXT: asr x0, x8, #32
; CHECK-NEXT: ret
@@ -140,7 +140,7 @@ define i64 @ashr_add_shl_mismatch_shifts1(i64 %r) {
define i64 @ashr_add_shl_mismatch_shifts2(i64 %r) {
; CHECK-LABEL: ashr_add_shl_mismatch_shifts2:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #4294967296
+; CHECK-NEXT: mov x8, #4294967296 // =0x100000000
; CHECK-NEXT: add x8, x8, x0, lsr #8
; CHECK-NEXT: lsr x0, x8, #8
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll
index 7e958b266846a1..6525d6cd7458b5 100644
--- a/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/AArch64/vector_splat-const-shift-of-constmasked.ll
@@ -328,7 +328,7 @@ define <8 x i16> @test_128_i16_x_8_127_mask_lshr_1(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_3(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_lshr_3:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #3
@@ -340,7 +340,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_3(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_4(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_lshr_4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #4
@@ -352,7 +352,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_4(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_5(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_lshr_5:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #5
@@ -364,7 +364,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_5(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_lshr_6(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_lshr_6:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #6
@@ -432,7 +432,7 @@ define <8 x i16> @test_128_i16_x_8_127_mask_ashr_1(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_3(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_ashr_3:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #3
@@ -444,7 +444,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_3(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_4(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_ashr_4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #4
@@ -456,7 +456,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_4(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_5(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_ashr_5:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #5
@@ -468,7 +468,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_5(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_ashr_6(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_ashr_6:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #6
@@ -565,7 +565,7 @@ define <8 x i16> @test_128_i16_x_8_127_mask_shl_10(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_shl_3(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_shl_3:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.8h, v0.8h, #3
@@ -577,7 +577,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_shl_3(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_shl_4(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_shl_4:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.8h, v0.8h, #4
@@ -589,7 +589,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_shl_4(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_shl_5(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_shl_5:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.8h, v0.8h, #5
@@ -601,7 +601,7 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_shl_5(<8 x i16> %a0) {
define <8 x i16> @test_128_i16_x_8_2032_mask_shl_6(<8 x i16> %a0) {
; CHECK-LABEL: test_128_i16_x_8_2032_mask_shl_6:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2032
+; CHECK-NEXT: mov w8, #2032 // =0x7f0
; CHECK-NEXT: dup v1.8h, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.8h, v0.8h, #6
@@ -644,7 +644,7 @@ define <4 x i32> @test_128_i32_x_4_32767_mask_lshr_1(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_7(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_lshr_7:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #7
@@ -656,7 +656,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_7(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_8(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_lshr_8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #8
@@ -668,7 +668,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_8(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_9(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_lshr_9:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #9
@@ -680,7 +680,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_9(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_lshr_10(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_lshr_10:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #10
@@ -748,7 +748,7 @@ define <4 x i32> @test_128_i32_x_4_32767_mask_ashr_1(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_7(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_ashr_7:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #7
@@ -760,7 +760,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_7(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_8(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_ashr_8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #8
@@ -772,7 +772,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_8(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_9(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_ashr_9:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #9
@@ -784,7 +784,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_9(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_ashr_10(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_ashr_10:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #10
@@ -881,7 +881,7 @@ define <4 x i32> @test_128_i32_x_4_32767_mask_shl_18(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_7(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_shl_7:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.4s, v0.4s, #7
@@ -893,7 +893,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_7(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_8(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_shl_8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.4s, v0.4s, #8
@@ -905,7 +905,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_8(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_9(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_shl_9:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.4s, v0.4s, #9
@@ -917,7 +917,7 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_9(<4 x i32> %a0) {
define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_10(<4 x i32> %a0) {
; CHECK-LABEL: test_128_i32_x_4_8388352_mask_shl_10:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8388352
+; CHECK-NEXT: mov w8, #8388352 // =0x7fff00
; CHECK-NEXT: dup v1.4s, w8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.4s, v0.4s, #10
@@ -948,7 +948,7 @@ define <4 x i32> @test_128_i32_x_4_4294836224_mask_shl_1(<4 x i32> %a0) {
define <2 x i64> @test_128_i64_x_2_2147483647_mask_lshr_1(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_2147483647_mask_lshr_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2147483647
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #1
@@ -961,7 +961,7 @@ define <2 x i64> @test_128_i64_x_2_2147483647_mask_lshr_1(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_15(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_15:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #15
@@ -973,7 +973,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_15(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_16(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #16
@@ -985,7 +985,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_16(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_17(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_17:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #17
@@ -997,7 +997,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_17(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_18(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_lshr_18:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #18
@@ -1010,7 +1010,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_lshr_18(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_lshr_1(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_18446744065119617024_mask_lshr_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-8589934592
+; CHECK-NEXT: mov x8, #-8589934592 // =0xfffffffe00000000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #1
@@ -1022,7 +1022,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_lshr_1(<2 x i64> %a
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_lshr_32(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_18446744065119617024_mask_lshr_32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-8589934592
+; CHECK-NEXT: mov x8, #-8589934592 // =0xfffffffe00000000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #32
@@ -1055,7 +1055,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_lshr_34(<2 x i64> %
define <2 x i64> @test_128_i64_x_2_2147483647_mask_ashr_1(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_2147483647_mask_ashr_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2147483647
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #1
@@ -1068,7 +1068,7 @@ define <2 x i64> @test_128_i64_x_2_2147483647_mask_ashr_1(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_15(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_15:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #15
@@ -1080,7 +1080,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_15(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_16(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #16
@@ -1092,7 +1092,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_16(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_17(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_17:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #17
@@ -1104,7 +1104,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_17(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_18(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_ashr_18:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ushr v0.2d, v0.2d, #18
@@ -1117,7 +1117,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_ashr_18(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_1(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-8589934592
+; CHECK-NEXT: mov x8, #-8589934592 // =0xfffffffe00000000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: sshr v0.2d, v0.2d, #1
@@ -1129,7 +1129,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_1(<2 x i64> %a
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_32(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-8589934592
+; CHECK-NEXT: mov x8, #-8589934592 // =0xfffffffe00000000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: sshr v0.2d, v0.2d, #32
@@ -1162,7 +1162,7 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_34(<2 x i64> %
define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_1(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_2147483647_mask_shl_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2147483647
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: add v0.2d, v0.2d, v0.2d
@@ -1174,7 +1174,7 @@ define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_1(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_32(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_2147483647_mask_shl_32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #2147483647
+; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.2d, v0.2d, #32
@@ -1205,7 +1205,7 @@ define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_34(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_15(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_shl_15:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.2d, v0.2d, #15
@@ -1217,7 +1217,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_15(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_16(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_shl_16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.2d, v0.2d, #16
@@ -1229,7 +1229,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_16(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_17(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_shl_17:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.2d, v0.2d, #17
@@ -1241,7 +1241,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_17(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_18(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_140737488289792_mask_shl_18:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #140737488289792
+; CHECK-NEXT: mov x8, #140737488289792 // =0x7fffffff0000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: shl v0.2d, v0.2d, #18
@@ -1254,7 +1254,7 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_18(<2 x i64> %a0) {
define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_shl_1(<2 x i64> %a0) {
; CHECK-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #-8589934592
+; CHECK-NEXT: mov x8, #-8589934592 // =0xfffffffe00000000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: add v0.2d, v0.2d, v0.2d
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 99755133f36d6a..a693e13f37ea36 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefixes=R600,ALL
; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=GFX6,GFX678,ALL
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX8,GFX678,ALL
@@ -17,6 +18,72 @@
; GFX10: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX11: global_store_b64 v2, v[0:1], s[0:1]
define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
+; R600-LABEL: build_vector2:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.Y, literal.x,
+; R600-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; R600-NEXT: MOV T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 5(7.006492e-45), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_vector2:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, 5
+; GFX6-NEXT: v_mov_b32_e32 v1, 6
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector2:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, 6
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector2:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 5
+; GFX10-NEXT: v_mov_b32_e32 v1, 6
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_vector2:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, 5
+; GFX11-NEXT: v_mov_b32_e32 v1, 6
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_vector2:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 5
+; GFX940-NEXT: v_mov_b32_e32 v1, 6
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT: s_endpgm
entry:
store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
ret void
@@ -40,6 +107,86 @@ entry:
; GFX10: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX11: global_store_b128 v4, v[0:3], s[0:1]
define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
+; R600-LABEL: build_vector4:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV * T0.W, literal.x,
+; R600-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; R600-NEXT: MOV * T0.Z, literal.x,
+; R600-NEXT: 7(9.809089e-45), 0(0.000000e+00)
+; R600-NEXT: MOV * T0.Y, literal.x,
+; R600-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; R600-NEXT: MOV T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 5(7.006492e-45), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_vector4:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, 5
+; GFX6-NEXT: v_mov_b32_e32 v1, 6
+; GFX6-NEXT: v_mov_b32_e32 v2, 7
+; GFX6-NEXT: v_mov_b32_e32 v3, 8
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector4:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, 5
+; GFX8-NEXT: v_mov_b32_e32 v1, 6
+; GFX8-NEXT: v_mov_b32_e32 v2, 7
+; GFX8-NEXT: v_mov_b32_e32 v3, 8
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector4:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v4, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, 5
+; GFX10-NEXT: v_mov_b32_e32 v1, 6
+; GFX10-NEXT: v_mov_b32_e32 v2, 7
+; GFX10-NEXT: v_mov_b32_e32 v3, 8
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_vector4:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, 5
+; GFX11-NEXT: v_mov_b32_e32 v1, 6
+; GFX11-NEXT: v_mov_b32_e32 v2, 7
+; GFX11-NEXT: v_mov_b32_e32 v3, 8
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_vector4:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v4, 0
+; GFX940-NEXT: v_mov_b32_e32 v0, 5
+; GFX940-NEXT: v_mov_b32_e32 v1, 6
+; GFX940-NEXT: v_mov_b32_e32 v2, 7
+; GFX940-NEXT: v_mov_b32_e32 v3, 8
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_endpgm
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
ret void
@@ -60,6 +207,65 @@ entry:
; GFX10: global_store_dword v0, v1, s[0:1]
; GFX11: global_store_b32 v0, v1, s[0:1]
define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
+; R600-LABEL: build_vector_v2i16:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: MOV T4.X, literal.x,
+; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; R600-NEXT: 393221(5.510200e-40), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_vector_v2i16:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, 0x60005
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector_v2i16:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x60005
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector_v2i16:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x60005
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_vector_v2i16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0x60005
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_vector_v2i16:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: s_endpgm
entry:
store <2 x i16> <i16 5, i16 6>, ptr addrspace(1) %out
ret void
@@ -90,6 +296,82 @@ entry:
; GFX10: global_store_dword v0, v1, s[0:1]
; GFX11: global_store_b32 v0, v1, s[0:1]
define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 %a) {
+; R600-LABEL: build_vector_v2i16_trunc:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
+; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; R600-NEXT: OR_INT T4.X, PV.W, literal.x,
+; R600-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
+; R600-NEXT: 327680(4.591775e-40), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_vector_v2i16_trunc:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_alignbit_b32 v0, 5, s4, 16
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_vector_v2i16_trunc:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_lshr_b32 s4, s4, 16
+; GFX8-NEXT: s_or_b32 s4, s4, 0x50000
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_vector_v2i16_trunc:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s2, s[0:1], 0x8
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_lshr_b32 s2, s2, 16
+; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, 5
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_vector_v2i16_trunc:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_pack_hl_b32_b16 s2, s2, 5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_vector_v2i16_trunc:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_lshr_b32 s0, s4, 16
+; GFX940-NEXT: s_pack_ll_b32_b16 s0, s0, 5
+; GFX940-NEXT: v_mov_b32_e32 v1, s0
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
+; GFX940-NEXT: s_endpgm
%srl = lshr i32 %a, 16
%trunc = trunc i32 %srl to i16
%ins.0 = insertelement <2 x i16> undef, i16 %trunc, i32 0
@@ -186,6 +468,93 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_endpgm
define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) {
+; R600-LABEL: build_v2i32_from_v4i16_shuffle:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 0, @10, KC0[], KC1[]
+; R600-NEXT: TEX 1 @6
+; R600-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_16 T1.X, T0.X, 48, #3
+; R600-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
+; R600-NEXT: ALU clause starting at 10:
+; R600-NEXT: MOV * T0.X, 0.0,
+; R600-NEXT: ALU clause starting at 11:
+; R600-NEXT: LSHL * T0.Y, T1.X, literal.x,
+; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; R600-NEXT: LSHL T0.X, T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 16(2.242078e-44), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: s_mov_b32 s4, s0
+; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, s0
+; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_lshl_b32 s0, s3, 16
+; GFX8-NEXT: s_lshl_b32 s1, s2, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_lshl_b32 s2, s2, 16
+; GFX10-NEXT: s_lshl_b32 s3, s3, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_lshl_b32 s3, s3, 16
+; GFX940-NEXT: s_lshl_b32 s2, s2, 16
+; GFX940-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT: s_endpgm
entry:
%shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
%zextended = zext <2 x i16> %shuf to <2 x i32>
@@ -193,3 +562,7 @@ entry:
store <2 x i32> %shifted, ptr addrspace(1) %out
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ALL: {{.*}}
+; GFX1011: {{.*}}
+; GFX678: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 03ca780c903226..c7677942719de1 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,GFX11 %s
@@ -10,6 +11,45 @@
; GCN: s_xor_b32 [[NEG_VAL:s[0-9]+]], [[VAL]], 0x80000000
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[NEG_VAL]]
define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: s_fneg_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = fsub float -0.000000e+00, %in
store float %fneg, ptr addrspace(1) %out
ret void
@@ -22,6 +62,52 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) {
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x float> %in) {
+; SI-LABEL: s_fneg_v2f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_xor_b32 s0, s3, 0x80000000
+; SI-NEXT: s_xor_b32 s1, s2, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s1
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_xor_b32 s0, s3, 0x80000000
+; VI-NEXT: s_xor_b32 s1, s2, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
store <2 x float> %fneg, ptr addrspace(1) %out
ret void
@@ -38,6 +124,61 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x float> %in) {
+; SI-LABEL: s_fneg_v4f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s7, s7, 0x80000000
+; SI-NEXT: s_xor_b32 s6, s6, 0x80000000
+; SI-NEXT: s_xor_b32 s5, s5, 0x80000000
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_mov_b32_e32 v2, s6
+; SI-NEXT: v_mov_b32_e32 v3, s7
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_v4f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s7, s7, 0x80000000
+; VI-NEXT: s_xor_b32 s6, s6, 0x80000000
+; VI-NEXT: s_xor_b32 s5, s5, 0x80000000
+; VI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_v4f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s7, 0x80000000
+; GFX11-NEXT: s_xor_b32 s3, s6, 0x80000000
+; GFX11-NEXT: s_xor_b32 s4, s4, 0x80000000
+; GFX11-NEXT: s_xor_b32 s5, s5, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
store <4 x float> %fneg, ptr addrspace(1) %out
ret void
@@ -54,6 +195,41 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl
; R600-NOT: XOR
; R600: -KC0[2].Z
define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: fsub0_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_sub_f32_e64 v0, 0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fsub0_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_sub_f32_e64 v0, 0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fsub0_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_f32_e64 v0, 0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%bc = bitcast i32 %in to float
%fsub = fsub float 0.0, %bc
store float %fsub, ptr addrspace(1) %out
@@ -71,6 +247,45 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) {
; R600-NOT: XOR
; R600: -PV.W
define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: fneg_free_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_free_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fneg_free_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%bc = bitcast i32 %in to float
%fsub = fsub float -0.0, %bc
store float %fsub, ptr addrspace(1) %out
@@ -84,6 +299,41 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) {
; GCN-NOT: xor
; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: fneg_fold_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mul_f32_e64 v0, -s4, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: fneg_fold_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mul_f32_e64 v0, -s4, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: fneg_fold_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mul_f32_e64 v0, -s2, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fsub = fsub float -0.0, %in
%fmul = fmul float %fsub, %in
store float %fmul, ptr addrspace(1) %out
@@ -94,6 +344,41 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) {
; FUNC-LABEL: {{^}}bitpreserve_fneg_f32:
; GCN: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -4.0
define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in) {
+; SI-LABEL: bitpreserve_fneg_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mul_f32_e64 v0, s4, -4.0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: bitpreserve_fneg_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mul_f32_e64 v0, s4, -4.0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: bitpreserve_fneg_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mul_f32_e64 v0, s2, -4.0
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%in.bc = bitcast float %in to i32
%int.abs = xor i32 %in.bc, 2147483648
%bc = bitcast i32 %int.abs to float
@@ -107,6 +392,45 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in
; GCN: s_xor_b32 [[FNEG:s[0-9]+]], [[IN]], 0x80000000
; GCN: v_mov_b32_e32 [[V_FNEG:v[0-9]+]], [[FNEG]]
define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_fneg_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_xor_b32 s4, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i32 %in, -2147483648
store i32 %fneg, ptr addrspace(1) %out
ret void
@@ -117,6 +441,11 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) {
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: s_setpc_b64
define i32 @v_fneg_i32(i32 %in) {
+; FUNC-LABEL: v_fneg_i32:
+; FUNC: ; %bb.0:
+; FUNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FUNC-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; FUNC-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i32 %in, -2147483648
ret i32 %fneg
}
@@ -125,6 +454,41 @@ define i32 @v_fneg_i32(i32 %in) {
; GCN: s_load_{{dword|b32}} [[IN:s[0-9]+]]
; GCN: v_sub_f32_e64 v{{[0-9]+}}, 2.0, [[IN]]
define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
+; SI-LABEL: s_fneg_i32_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_sub_f32_e64 v0, 2.0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i32_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_sub_f32_e64 v0, 2.0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i32_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_f32_e64 v0, 2.0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i32 %in, -2147483648
%bitcast = bitcast i32 %fneg to float
%fadd = fadd float %bitcast, 2.0
@@ -137,6 +501,11 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
; GCN-NEXT: v_sub_f32_e32 v0, 2.0, v0
; GCN-NEXT: s_setpc_b64
define float @v_fneg_i32_fp_use(i32 %in) {
+; FUNC-LABEL: v_fneg_i32_fp_use:
+; FUNC: ; %bb.0:
+; FUNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FUNC-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; FUNC-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i32 %in, -2147483648
%bitcast = bitcast i32 %fneg to float
%fadd = fadd float %bitcast, 2.0
@@ -146,6 +515,49 @@ define float @v_fneg_i32_fp_use(i32 %in) {
; FUNC-LABEL: {{^}}s_fneg_i64:
; GCN: s_xor_b32 s[[NEG_HI:[0-9]+]], s{{[0-9]+}}, 0x80000000
define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: s_fneg_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_xor_b32 s0, s3, 0x80000000
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_xor_b32 s0, s3, 0x80000000
+; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i64 %in, -9223372036854775808
store i64 %fneg, ptr addrspace(1) %out
ret void
@@ -156,6 +568,11 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) {
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GCN-NEXT: s_setpc_b64
define i64 @v_fneg_i64(i64 %in) {
+; FUNC-LABEL: v_fneg_i64:
+; FUNC: ; %bb.0:
+; FUNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FUNC-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; FUNC-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i64 %in, -9223372036854775808
ret i64 %fneg
}
@@ -163,6 +580,39 @@ define i64 @v_fneg_i64(i64 %in) {
; FUNC-LABEL: {{^}}s_fneg_i64_fp_use:
; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, -s{{\[[0-9]+:[0-9]+\]}}, 2.0
define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
+; SI-LABEL: s_fneg_i64_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i64_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i64_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i64 %in, -9223372036854775808
%bitcast = bitcast i64 %fneg to double
%fadd = fadd double %bitcast, 2.0
@@ -175,6 +625,11 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
; GCN-NEXT: v_add_f64 v[0:1], -v[0:1], 2.0
; GCN-NEXT: s_setpc_b64
define double @v_fneg_i64_fp_use(i64 %in) {
+; FUNC-LABEL: v_fneg_i64_fp_use:
+; FUNC: ; %bb.0:
+; FUNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FUNC-NEXT: v_add_f64 v[0:1], -v[0:1], 2.0
+; FUNC-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i64 %in, -9223372036854775808
%bitcast = bitcast i64 %fneg to double
%fadd = fadd double %bitcast, 2.0
@@ -186,6 +641,11 @@ define double @v_fneg_i64_fp_use(i64 %in) {
; GCN-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GCN-NEXT: s_setpc_b64
define i16 @v_fneg_i16(i16 %in) {
+; FUNC-LABEL: v_fneg_i16:
+; FUNC: ; %bb.0:
+; FUNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FUNC-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; FUNC-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i16 %in, -32768
ret i16 %fneg
}
@@ -198,6 +658,43 @@ define i16 @v_fneg_i16(i16 %in) {
; VI: s_load_dword [[IN:s[0-9]+]]
; VI: v_sub_f16_e64 v{{[0-9]+}}, 2.0, [[IN]]
define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
+; SI-LABEL: s_fneg_i16_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_i16_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_sub_f16_e64 v0, 2.0, s4
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_i16_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_sub_f16_e64 v0, 2.0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%fneg = xor i16 %in, -32768
%bitcast = bitcast i16 %fneg to half
%fadd = fadd half %bitcast, 2.0
@@ -215,6 +712,24 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
; VI-NEXT: v_sub_f16_e32 v0, 2.0, v0
; VI-NEXT: s_setpc_b64
define half @v_fneg_i16_fp_use(i16 %in) {
+; SI-LABEL: v_fneg_i16_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_fneg_i16_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_sub_f16_e32 v0, 2.0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fneg_i16_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_f16_e32 v0, 2.0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg = xor i16 %in, -32768
%bitcast = bitcast i16 %fneg to half
%fadd = fadd half %bitcast, 2.0
@@ -231,6 +746,50 @@ define half @v_fneg_i16_fp_use(i16 %in) {
; VI: s_lshl_b32 s5, s5, 16
; VI: s_or_b32 s4, s4, s5
define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
+; SI-LABEL: s_fneg_v2i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_xor_b32 s4, s4, 0x80008000
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshr_b32 s5, s4, 16
+; VI-NEXT: s_xor_b32 s4, s4, 0x8000
+; VI-NEXT: s_xor_b32 s5, s5, 0x8000
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_v2i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%in = bitcast i32 %arg to <2 x i16>
%fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
store <2 x i16> %fneg, ptr addrspace(1) %out
@@ -249,6 +808,28 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
; VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; VI-NEXT: s_setpc_b64
define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) {
+; SI-LABEL: v_fneg_v2i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_fneg_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fneg_v2i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
ret <2 x i16> %fneg
}
@@ -268,6 +849,56 @@ define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) {
; VI: v_add_f16_e64 v1, s4, 2.0
; VI: v_or_b32_e32 v0, v1, v0
define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) {
+; SI-LABEL: s_fneg_v2i16_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: s_fneg_v2i16_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v1, 0x4000
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshr_b32 s5, s4, 16
+; VI-NEXT: s_xor_b32 s5, s5, 0x8000
+; VI-NEXT: s_xor_b32 s4, s4, 0x8000
+; VI-NEXT: v_mov_b32_e32 v0, s5
+; VI-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f16_e64 v1, s4, 2.0
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_fneg_v2i16_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%in = bitcast i32 %arg to <2 x i16>
%fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
%bitcast = bitcast <2 x i16> %fneg to <2 x half>
@@ -290,9 +921,35 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg)
; VI: v_or_b32_e32 v0, v0, v1
; VI: s_setpc_b64
define <2 x half> @v_fneg_v2i16_fp_use(i32 %arg) {
+; SI-LABEL: v_fneg_v2i16_fp_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_fneg_v2i16_fp_use:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, 0x4000
+; VI-NEXT: v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_sub_f16_e32 v0, 2.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fneg_v2i16_fp_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%in = bitcast i32 %arg to <2 x i16>
%fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
%bitcast = bitcast <2 x i16> %fneg to <2 x half>
%fadd = fadd <2 x half> %bitcast, <half 2.0, half 2.0>
ret <2 x half> %fadd
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
index 1a73df341108fe..8a9d731334ec5f 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HSA-VI,FUNC %s
; Repeat of some problematic tests in kernel-args.ll, with the IR
@@ -11,6 +12,16 @@
; HSA-VI: .amdhsa_kernarg_size 12
define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
+; GCN-LABEL: i1_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s2, s2, 1
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: global_store_byte v0, v1, s[0:1]
+; GCN-NEXT: s_endpgm
store i1 %x, ptr addrspace(1) %out, align 1
ret void
}
@@ -22,6 +33,20 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
; HSA-VI: .amdhsa_kernarg_size 12
define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind {
+; GCN-LABEL: v3i8_arg:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshr_b32 s3, s2, 8
+; GCN-NEXT: s_and_b32 s4, s2, 0xff
+; GCN-NEXT: v_lshlrev_b16_e64 v2, 8, s3
+; GCN-NEXT: v_or_b32_e32 v2, s4, v2
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: global_store_short v0, v2, s[0:1]
+; GCN-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:2
+; GCN-NEXT: s_endpgm
entry:
store <3 x i8> %in, ptr addrspace(1) %out, align 4
ret void
@@ -32,6 +57,19 @@ entry:
; HSA-VI: .amdhsa_kernarg_size 24
define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind {
+; GCN-LABEL: i65_arg:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x10
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s4, s6, 1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: global_store_byte v2, v3, s[0:1] offset:8
+; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GCN-NEXT: s_endpgm
entry:
store i65 %in, ptr addrspace(1) %out, align 4
ret void
@@ -40,6 +78,9 @@ entry:
; FUNC-LABEL: {{^}}empty_struct_arg:
; HSA-VI: .amdhsa_kernarg_size 0
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
+; GCN-LABEL: empty_struct_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_endpgm
ret void
}
@@ -61,6 +102,30 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
; HSA-VI: .amdhsa_kernarg_size 40
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
+; GCN-LABEL: struct_argument_alignment:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GCN-NEXT: s_load_dword s7, s[4:5], 0x18
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x20
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: global_store_dword v[0:1], v2, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NEXT: global_store_dword v[0:1], v2, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = extractvalue {i32, i64} %arg0, 0
%val1 = extractvalue {i32, i64} %arg0, 1
%val2 = extractvalue {i32, i64} %arg1, 0
@@ -83,6 +148,28 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
; HSA-VI: .amdhsa_kernarg_size 28
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
+; GCN-LABEL: packed_struct_argument_alignment:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: global_load_dword v6, v2, s[4:5] offset:13
+; GCN-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:17
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x0
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v7, s2
+; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: global_store_dword v[2:3], v7, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v[2:3], v6, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = extractvalue <{i32, i64}> %arg0, 0
%val1 = extractvalue <{i32, i64}> %arg0, 1
%val2 = extractvalue <{i32, i64}> %arg1, 0
@@ -103,6 +190,37 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; HSA-VI: .amdhsa_kernarg_size 64
define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
+; GCN-LABEL: struct_argument_alignment_after:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s10, s[4:5], 0x0
+; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
+; GCN-NEXT: s_load_dword s11, s[4:5], 0x18
+; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x20
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NEXT: global_store_dword v[4:5], v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NEXT: global_store_dword v[4:5], v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = extractvalue {i32, i64} %arg0, 0
%val1 = extractvalue {i32, i64} %arg0, 1
%val2 = extractvalue {i32, i64} %arg2, 0
@@ -118,6 +236,23 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8,
; GCN-LABEL: {{^}}array_3xi32:
; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
+; GCN-LABEL: array_3xi32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: global_store_short v[0:1], v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v[0:1], v1, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v[0:1], v2, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: global_store_dword v[0:1], v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
store volatile i16 %arg0, ptr addrspace(1) undef
store volatile [3 x i32] %arg1, ptr addrspace(1) undef
ret void
@@ -126,6 +261,21 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
; GCN-LABEL: {{^}}array_3xi16:
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
+; GCN-LABEL: array_3xi16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: global_store_byte v[0:1], v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_short_d16_hi v[0:1], v1, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_short v[0:1], v1, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_short_d16_hi v[0:1], v0, off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
store volatile i8 %arg0, ptr addrspace(1) undef
store volatile [3 x i16] %arg1, ptr addrspace(1) undef
ret void
@@ -136,6 +286,20 @@ define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
; GCN-DAG: s_bfe_u32 [[BFE:s[0-9]+]], [[DWORD]], 0x100010{{$}}
; GCN-DAG: s_and_b32 [[AND:s[0-9]+]], [[DWORD]], 0x7fff{{$}}
define amdgpu_kernel void @v2i15_arg(ptr addrspace(1) nocapture %out, <2 x i15> %in) {
+; GCN-LABEL: v2i15_arg:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s3, s2, 0x7fff
+; GCN-NEXT: s_bfe_u32 s2, s2, 0x100010
+; GCN-NEXT: s_lshl_b32 s2, s2, 15
+; GCN-NEXT: s_or_b32 s2, s3, s2
+; GCN-NEXT: s_andn2_b32 s2, s2, -2.0
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_endpgm
entry:
store <2 x i15> %in, ptr addrspace(1) %out, align 4
ret void
@@ -148,6 +312,25 @@ entry:
; GCN: s_and_b32
; GCN: s_or_b32
define amdgpu_kernel void @v3i15_arg(ptr addrspace(1) nocapture %out, <3 x i15> %in) {
+; GCN-LABEL: v3i15_arg:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s4, s3, 0xffff
+; GCN-NEXT: s_and_b32 s5, s2, 0x7fff
+; GCN-NEXT: s_lshr_b32 s6, s2, 1
+; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 30
+; GCN-NEXT: s_and_b32 s4, s6, 0x3fff8000
+; GCN-NEXT: s_and_b32 s6, s3, 0x1fff
+; GCN-NEXT: s_or_b32 s4, s5, s4
+; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT: global_store_short v0, v1, s[0:1] offset:4
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_endpgm
entry:
store <3 x i15> %in, ptr addrspace(1) %out, align 4
ret void
@@ -159,6 +342,14 @@ entry:
; GCN: global_load_ubyte v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8
; GCN: .amdhsa_kernarg_size 12
define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i8) %in.byref) {
+; GCN-LABEL: byref_constant_i8_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: global_load_ubyte v1, v0, s[4:5] offset:8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_endpgm
%in = load i8, ptr addrspace(4) %in.byref
%ext = zext i8 %in to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -170,6 +361,14 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out
; GCN: global_load_ushort v{{[0-9]+}}, [[ZERO]], s[4:5] offset:8
; GCN: .amdhsa_kernarg_size 12
define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i16) %in.byref) {
+; GCN-LABEL: byref_constant_i16_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] offset:8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_endpgm
%in = load i16, ptr addrspace(4) %in.byref
%ext = zext i16 %in to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -180,6 +379,18 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou
; GCN: s_load_dwordx4 [[LOAD:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GCN: .amdhsa_kernarg_size 16
define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in.byref, i32 %after.offset) {
+; GCN-LABEL: byref_constant_i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v0, v2, s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store volatile i32 %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
@@ -191,6 +402,23 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x20{{$}}
; GCN: .amdhsa_kernarg_size 36
define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(<4 x i32>) %in.byref, i32 %after.offset) {
+; GCN-LABEL: byref_constant_v4i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GCN-NEXT: s_load_dword s8, s[4:5], 0x20
+; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: v_mov_b32_e32 v5, s8
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v4, v5, s[6:7]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%in = load <4 x i32>, ptr addrspace(4) %in.byref
store volatile <4 x i32> %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
@@ -205,6 +433,19 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture %
; GCN: global_store_dword v{{[0-9]+}}, [[V_AFTER_OFFSET]], s
; GCN: .amdhsa_kernarg_size 264
define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
+; GCN-LABEL: byref_align_constant_i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: global_store_dword v0, v1, s[2:3]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v0, v2, s[2:3]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store volatile i32 %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
@@ -216,6 +457,41 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu
; GCN-DAG: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x40{{$}}
; GCN: .amdhsa_kernarg_size 132
define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) {
+; GCN-LABEL: byref_natural_align_constant_v16i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x80
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v4, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s20
+; GCN-NEXT: v_mov_b32_e32 v1, s21
+; GCN-NEXT: v_mov_b32_e32 v2, s22
+; GCN-NEXT: v_mov_b32_e32 v3, s23
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s16
+; GCN-NEXT: v_mov_b32_e32 v1, s17
+; GCN-NEXT: v_mov_b32_e32 v2, s18
+; GCN-NEXT: v_mov_b32_e32 v3, s19
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NEXT: v_mov_b32_e32 v2, s14
+; GCN-NEXT: v_mov_b32_e32 v3, s15
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NEXT: v_mov_b32_e32 v3, s11
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: global_store_dword v4, v0, s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%in = load <16 x i32>, ptr addrspace(4) %in.byref
store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4
store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
@@ -227,6 +503,15 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace
; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}}
; GCN: .amdhsa_kernarg_size 12
define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(1) byref(i32) %in.byref) {
+; GCN-LABEL: byref_global_i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_endpgm
%in = load i32, ptr addrspace(1) %in.byref
store i32 %in, ptr addrspace(1) %out, align 4
ret void
@@ -235,6 +520,16 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out,
; GCN-LABEL: {{^}}byref_flat_i32_arg:
; GCN: flat_load_dword [[IN:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} offset:8{{$}}
define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, ptr byref(i32) %in.byref) {
+; GCN-LABEL: byref_flat_i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: flat_load_dword v0, v[0:1] offset:8
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: s_endpgm
%in = load i32, ptr %in.byref
store i32 %in, ptr addrspace(1) %out, align 4
ret void
@@ -245,6 +540,17 @@ define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p
; GCN: s_mov_b32 s[[PTR_HI:[0-9]+]], 0{{$}}
; GCN: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(6) byref(i32) %in.byref) {
+; GCN-LABEL: byref_constant_32bit_i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s0, s4, 8
+; GCN-NEXT: s_mov_b32 s1, 0
+; GCN-NEXT: s_load_dword s6, s[0:1], 0x0
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NEXT: global_store_dword v0, v1, s[2:3]
+; GCN-NEXT: s_endpgm
%in = load i32, ptr addrspace(6) %in.byref
store i32 %in, ptr addrspace(1) %out, align 4
ret void
@@ -260,6 +566,22 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu
; GCN: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; GCN: .amdhsa_kernarg_size 20
define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) %in0.byref, ptr addrspace(4) byref(i32) %in1.byref, i32 %after.offset) {
+; GCN-LABEL: multi_byref_constant_i32_arg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x10
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: global_store_dword v0, v2, s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%in0 = load i32, ptr addrspace(4) %in0.byref
%in1 = load i32, ptr addrspace(4) %in1.byref
store volatile i32 %in0, ptr addrspace(1) %out, align 4
@@ -274,6 +596,13 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu
; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x0{{$}}
; GCN: .amdhsa_kernarg_size 4
define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref(i32) %in.byref) {
+; GCN-LABEL: byref_constant_i32_arg_offset0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: global_store_dword v[0:1], v0, off
+; GCN-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
store i32 %in, ptr addrspace(1) undef, align 4
ret void
@@ -281,3 +610,6 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FUNC: {{.*}}
+; HSA-VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
index 90e18a881340b3..5d243e3a5890a1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #0
@@ -5,6 +6,18 @@ declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #0
; CHECK-LABEL: {{^}}ds_bpermute:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @ds_bpermute(ptr addrspace(1) %out, i32 %index, i32 %src) nounwind {
+; CHECK-LABEL: ds_bpermute:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: ds_bpermute_b32 v2, v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_endpgm
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
store i32 %bpermute, ptr addrspace(1) %out, align 4
ret void
@@ -13,6 +26,18 @@ define amdgpu_kernel void @ds_bpermute(ptr addrspace(1) %out, i32 %index, i32 %s
; CHECK-LABEL: {{^}}ds_bpermute_imm_offset:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
define amdgpu_kernel void @ds_bpermute_imm_offset(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; CHECK-LABEL: ds_bpermute_imm_offset:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: ds_bpermute_b32 v2, v0, v1 offset:4
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_endpgm
%index = add i32 %base_index, 4
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
store i32 %bpermute, ptr addrspace(1) %out, align 4
@@ -22,6 +47,19 @@ define amdgpu_kernel void @ds_bpermute_imm_offset(ptr addrspace(1) %out, i32 %ba
; CHECK-LABEL: {{^}}ds_bpermute_imm_index:
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64
define amdgpu_kernel void @ds_bpermute_imm_index(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; CHECK-LABEL: ds_bpermute_imm_index:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0xc
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: ds_bpermute_b32 v2, v0, v1 offset:64
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_endpgm
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0
store i32 %bpermute, ptr addrspace(1) %out, align 4
ret void
@@ -31,6 +69,15 @@ define amdgpu_kernel void @ds_bpermute_imm_index(ptr addrspace(1) %out, i32 %bas
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
; CHECK: s_waitcnt lgkmcnt
define void @ds_bpermute_add_shl(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; CHECK-LABEL: ds_bpermute_add_shl:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 2, v2
+; CHECK-NEXT: ds_bpermute_b32 v2, v2, v3 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%index = add i32 %base_index, 1
%byte_index = shl i32 %index, 2
%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
@@ -42,6 +89,16 @@ define void @ds_bpermute_add_shl(ptr addrspace(1) %out, i32 %base_index, i32 %sr
; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
; CHECK: s_waitcnt lgkmcnt
define void @ds_bpermute_or_shl(ptr addrspace(1) %out, i32 %base_index, i32 %src) nounwind {
+; CHECK-LABEL: ds_bpermute_or_shl:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v2, 62, v2
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, 2, v2
+; CHECK-NEXT: ds_bpermute_b32 v2, v2, v3 offset:4
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%masked = and i32 %base_index, 62
%index = or i32 %masked, 1
%byte_index = shl i32 %index, 2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
index 25b36173cc65b5..61b4d240ef6942 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
@@ -554,3 +555,5 @@ declare i32 @llvm.amdgcn.sbfe.i32(i32, i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
index 5e03748bee08f5..94fbd0137a509e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,VI
@@ -7,6 +8,14 @@
;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc
;CHECK: s_waitcnt
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+; CHECK-LABEL: buffer_load:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v8, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v8, s[0:3], 0 idxen
+; CHECK-NEXT: buffer_load_dwordx4 v[4:7], v8, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_dwordx4 v[8:11], v8, s[0:3], 0 idxen slc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
%data_glc = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -21,6 +30,12 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+; CHECK-LABEL: buffer_load_immoffs:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen offset:40
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i32 0, i32 0)
ret <4 x float> %data
@@ -31,6 +46,13 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+; CHECK-LABEL: buffer_load_immoffs_large:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: s_movk_i32 s4, 0x1ffc
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], s4 idxen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 4, i32 8188, i32 0)
ret <4 x float> %data
@@ -40,6 +62,11 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
+; CHECK-LABEL: buffer_load_idx:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0)
ret <4 x float> %data
@@ -49,6 +76,14 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+; CHECK-LABEL: buffer_load_ofs:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0)
ret <4 x float> %data
@@ -58,6 +93,14 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+; CHECK-LABEL: buffer_load_ofs_imm:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%ofs = add i32 %1, 60
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i32 0, i32 0)
@@ -68,6 +111,11 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
+; CHECK-LABEL: buffer_load_both:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0)
ret <4 x float> %data
@@ -78,6 +126,12 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
+; CHECK-LABEL: buffer_load_both_reversed:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0)
ret <4 x float> %data
@@ -87,6 +141,11 @@ main_body:
;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: buffer_load_x1:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
ret float %data
@@ -96,6 +155,11 @@ main_body:
;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: buffer_load_x2:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
ret <2 x float> %data
@@ -105,6 +169,14 @@ main_body:
;CHECK: v_add_{{[iu]}}32_e32 {{v[0-9]+}}, vcc, -16, v0
;CHECK: buffer_load_dwordx4 v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen
define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
+; VI-LABEL: buffer_load_negative_offset:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ; return to shader part epilog
main_body:
%ofs.1 = add i32 %ofs, -16
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i32 0, i32 0)
@@ -117,6 +189,16 @@ main_body:
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3) %lds) {
+; VI-LABEL: buffer_load_mmo:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: buffer_load_dword v1, v2, s[0:3], 0 idxen
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: ds_write2_b32 v0, v2, v2 offset1:4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, v1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ; return to shader part epilog
entry:
store float 0.0, ptr addrspace(3) %lds
%val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -131,6 +213,14 @@ entry:
;CHECK: buffer_load_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc
;CHECK: s_waitcnt
define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) {
+; CHECK-LABEL: buffer_load_int:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v6, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v6, s[0:3], 0 idxen
+; CHECK-NEXT: buffer_load_dwordx2 v[4:5], v6, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_dword v6, v6, s[0:3], 0 idxen slc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
%data_glc = call <2 x i32> @llvm.amdgcn.struct.buffer.load.v2i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -151,6 +241,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_buffer_load_ubyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_buffer_load_ubyte:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = zext i8 %tmp to i32
@@ -165,6 +261,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_buffer_load_ushort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_buffer_load_ushort:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = zext i16 %tmp to i32
@@ -179,6 +281,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_buffer_load_sbyte(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_buffer_load_sbyte:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_sbyte v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = sext i8 %tmp to i32
@@ -193,6 +301,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_buffer_load_sshort(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_buffer_load_sshort:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_sshort v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = sext i16 %tmp to i32
@@ -206,6 +320,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b16 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_buffer_load_f16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store half %val, ptr addrspace(3) %ptr
@@ -218,6 +339,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_buffer_load_v2f16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b32 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <2 x half> %val, ptr addrspace(3) %ptr
@@ -230,6 +358,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_buffer_load_v4f16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[1:2]
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <4 x half> %val, ptr addrspace(3) %ptr
@@ -242,6 +377,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b16 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_buffer_load_i16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store i16 %val, ptr addrspace(3) %ptr
@@ -254,6 +396,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_v2i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_buffer_load_v2i16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b32 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <2 x i16> @llvm.amdgcn.struct.buffer.load.v2i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <2 x i16> %val, ptr addrspace(3) %ptr
@@ -266,6 +415,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
define amdgpu_ps void @struct_buffer_load_v4i16(<4 x i32> inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_buffer_load_v4i16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[1:2]
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <4 x i16> @llvm.amdgcn.struct.buffer.load.v4i16(<4 x i32> %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <4 x i16> %val, ptr addrspace(3) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll
index 2f9e6b0a1cf526..71adf4b2aaeab6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,VI
@@ -7,6 +8,14 @@
;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc
;CHECK: s_waitcnt
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) {
+; CHECK-LABEL: buffer_load:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v8, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v8, s[0:3], 0 idxen
+; CHECK-NEXT: buffer_load_dwordx4 v[4:7], v8, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_dwordx4 v[8:11], v8, s[0:3], 0 idxen slc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0)
%data_glc = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1)
@@ -21,6 +30,12 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) {
+; CHECK-LABEL: buffer_load_immoffs:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen offset:40
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 40, i32 0, i32 0)
ret <4 x float> %data
@@ -31,6 +46,13 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) {
+; CHECK-LABEL: buffer_load_immoffs_large:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: s_movk_i32 s4, 0x1ffc
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], s4 idxen offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 4, i32 8188, i32 0)
ret <4 x float> %data
@@ -40,6 +62,11 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) {
+; CHECK-LABEL: buffer_load_idx:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0, i32 0)
ret <4 x float> %data
@@ -49,6 +76,14 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) {
+; CHECK-LABEL: buffer_load_ofs:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i32 0, i32 0)
ret <4 x float> %data
@@ -58,6 +93,14 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) {
+; CHECK-LABEL: buffer_load_ofs_imm:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%ofs = add i32 %1, 60
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i32 0, i32 0)
@@ -68,6 +111,11 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32) {
+; CHECK-LABEL: buffer_load_both:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i32 0, i32 0)
ret <4 x float> %data
@@ -78,6 +126,12 @@ main_body:
;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg, i32, i32) {
+; CHECK-LABEL: buffer_load_both_reversed:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i32 0, i32 0)
ret <4 x float> %data
@@ -87,6 +141,11 @@ main_body:
;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps float @buffer_load_x1(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: buffer_load_x1:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
ret float %data
@@ -96,6 +155,11 @@ main_body:
;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
;CHECK: s_waitcnt
define amdgpu_ps <2 x float> @buffer_load_x2(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: buffer_load_x2:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
ret <2 x float> %data
@@ -105,6 +169,14 @@ main_body:
;CHECK: v_add_{{[iu]}}32_e32 {{v[0-9]+}}, vcc, -16, v0
;CHECK: buffer_load_dwordx4 v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen
define amdgpu_ps <4 x float> @buffer_load_negative_offset(ptr addrspace(8) inreg, i32 %ofs) {
+; VI-LABEL: buffer_load_negative_offset:
+; VI: ; %bb.0: ; %main_body
+; VI-NEXT: s_mov_b32 s4, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: ; return to shader part epilog
main_body:
%ofs.1 = add i32 %ofs, -16
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs.1, i32 0, i32 0)
@@ -117,6 +189,16 @@ main_body:
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
define amdgpu_ps float @buffer_load_mmo(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %lds) {
+; VI-LABEL: buffer_load_mmo:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: buffer_load_dword v1, v2, s[0:3], 0 idxen
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: ds_write2_b32 v0, v2, v2 offset1:4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, v1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ; return to shader part epilog
entry:
store float 0.0, ptr addrspace(3) %lds
%val = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
@@ -131,6 +213,14 @@ entry:
;CHECK: buffer_load_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc
;CHECK: s_waitcnt
define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(ptr addrspace(8) inreg) {
+; CHECK-LABEL: buffer_load_int:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: v_mov_b32_e32 v6, 0
+; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v6, s[0:3], 0 idxen
+; CHECK-NEXT: buffer_load_dwordx2 v[4:5], v6, s[0:3], 0 idxen glc
+; CHECK-NEXT: buffer_load_dword v6, v6, s[0:3], 0 idxen slc
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x i32> @llvm.amdgcn.struct.ptr.buffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0)
%data_glc = call <2 x i32> @llvm.amdgcn.struct.ptr.buffer.load.v2i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1)
@@ -151,6 +241,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_ptr_buffer_load_ubyte(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_ptr_buffer_load_ubyte:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i8 @llvm.amdgcn.struct.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = zext i8 %tmp to i32
@@ -165,6 +261,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_ptr_buffer_load_ushort(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_ptr_buffer_load_ushort:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = zext i16 %tmp to i32
@@ -179,6 +281,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_ptr_buffer_load_sbyte(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_ptr_buffer_load_sbyte:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_sbyte v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i8 @llvm.amdgcn.struct.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = sext i8 %tmp to i32
@@ -193,6 +301,12 @@ main_body:
;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
;CHECK-NEXT: ; return to shader part epilog
define amdgpu_ps float @struct_ptr_buffer_load_sshort(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) {
+; CHECK-LABEL: struct_ptr_buffer_load_sshort:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_sshort v0, v[0:1], s[0:3], 0 idxen offen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
+; CHECK-NEXT: ; return to shader part epilog
main_body:
%tmp = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0)
%tmp2 = sext i16 %tmp to i32
@@ -206,6 +320,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b16 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_ptr_buffer_load_f16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call half @llvm.amdgcn.struct.ptr.buffer.load.f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store half %val, ptr addrspace(3) %ptr
@@ -218,6 +339,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_ptr_buffer_load_v2f16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b32 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <2 x half> %val, ptr addrspace(3) %ptr
@@ -230,6 +358,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_ptr_buffer_load_v4f16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[1:2]
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <4 x half> %val, ptr addrspace(3) %ptr
@@ -242,6 +377,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b16 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_ptr_buffer_load_i16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store i16 %val, ptr addrspace(3) %ptr
@@ -254,6 +396,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b32 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_ptr_buffer_load_v2i16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b32 v0, v1
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <2 x i16> @llvm.amdgcn.struct.ptr.buffer.load.v2i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <2 x i16> %val, ptr addrspace(3) %ptr
@@ -266,6 +415,13 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: ds_write_b64 v0, [[VAL]]
define amdgpu_ps void @struct_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) {
+; CHECK-LABEL: struct_ptr_buffer_load_v4i16:
+; CHECK: ; %bb.0: ; %main_body
+; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write_b64 v0, v[1:2]
+; CHECK-NEXT: s_endpgm
main_body:
%val = call <4 x i16> @llvm.amdgcn.struct.ptr.buffer.load.v4i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0)
store <4 x i16> %val, ptr addrspace(3) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index ab035b9de04b9d..f9ff7609755a93 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,GCN,SI-NOHSA,FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI,VI-NOHSA,GCN,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
@@ -15,6 +16,38 @@
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_x:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x6
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_x:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x18
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_x:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.X, KC0[1].Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call i32 @llvm.r600.read.local.size.x() #0
store i32 %0, ptr addrspace(1) %out
@@ -30,6 +63,38 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_y:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x7
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_y:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x1c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_y:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.X, KC0[1].W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call i32 @llvm.r600.read.local.size.y() #0
store i32 %0, ptr addrspace(1) %out
@@ -45,6 +110,38 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_z:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x8
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_z:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_z:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.X, KC0[2].X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%0 = call i32 @llvm.r600.read.local.size.z() #0
store i32 %0, ptr addrspace(1) %out
@@ -58,6 +155,40 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_xy:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mul_i32 s4, s4, s5
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_xy:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x18
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mul_i32 s4, s4, s5
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_xy:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MULLO_INT * T1.X, KC0[1].Z, KC0[1].W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%y = call i32 @llvm.r600.read.local.size.y() #0
@@ -77,6 +208,42 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_xz:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s2, s[0:1], 0x6
+; SI-NEXT: s_load_dword s4, s[0:1], 0x8
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mul_i32 s4, s2, s4
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_xz:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x18
+; VI-NEXT: s_load_dword s5, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mul_i32 s4, s4, s5
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_xz:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MULLO_INT * T1.X, KC0[1].Z, KC0[2].X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%z = call i32 @llvm.r600.read.local.size.z() #0
@@ -95,6 +262,42 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_yz:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x7
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mul_i32 s0, s0, s1
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s2
+; SI-NEXT: s_mov_b32 s5, s3
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_yz:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x1c
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mul_i32 s0, s0, s1
+; VI-NEXT: s_mov_b32 s4, s2
+; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_yz:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MULLO_INT * T1.X, KC0[1].W, KC0[2].X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%y = call i32 @llvm.r600.read.local.size.y() #0
%z = call i32 @llvm.r600.read.local.size.z() #0
@@ -116,6 +319,45 @@ entry:
; GCN-DAG: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_xyz:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x6
+; SI-NEXT: s_load_dword s2, s[0:1], 0x8
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mul_i32 s4, s4, s5
+; SI-NEXT: s_add_i32 s4, s4, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_xyz:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x18
+; VI-NEXT: s_load_dword s6, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_mul_i32 s4, s4, s5
+; VI-NEXT: s_add_i32 s4, s4, s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_xyz:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: MULLO_INT * T0.X, KC0[1].Z, KC0[1].W,
+; EG-NEXT: ADD_INT T0.X, PS, KC0[2].X,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%y = call i32 @llvm.r600.read.local.size.y() #0
@@ -133,6 +375,38 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_x_known_bits:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x6
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_x_known_bits:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x18
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_x_known_bits:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T1.X, KC0[1].Z, literal.y,
+; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
entry:
%size = call i32 @llvm.r600.read.local.size.x() #0
%shl = shl i32 %size, 16
@@ -148,6 +422,38 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_y_known_bits:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x7
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_y_known_bits:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x1c
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_y_known_bits:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T1.X, KC0[1].W, literal.y,
+; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
entry:
%size = call i32 @llvm.r600.read.local.size.y() #0
%shl = shl i32 %size, 16
@@ -163,6 +469,38 @@ entry:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) {
+; SI-LABEL: local_size_z_known_bits:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dword s4, s[0:1], 0x8
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; VI-LABEL: local_size_z_known_bits:
+; VI: ; %bb.0: ; %entry
+; VI-NEXT: s_load_dword s4, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_endpgm
+;
+; EG-LABEL: local_size_z_known_bits:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T1.X, KC0[2].X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
entry:
%size = call i32 @llvm.r600.read.local.size.z() #0
%shl = shl i32 %size, 16
@@ -176,3 +514,8 @@ declare i32 @llvm.r600.read.local.size.y() #0
declare i32 @llvm.r600.read.local.size.z() #0
attributes #0 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FUNC: {{.*}}
+; GCN: {{.*}}
+; SI-NOHSA: {{.*}}
+; VI-NOHSA: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 4cc469b9b49a06..f7eb42a5f93227 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI,MUBUF %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI,MUBUF %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9PLUS,MUBUF,GFX9-MUBUF,GFX9_10-MUBUF %s
@@ -82,6 +83,1028 @@
; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, [[CLAMP_IDX]], off{{$}}
define amdgpu_ps float @ps_main(i32 %idx) {
+; SI-LABEL: ps_main:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s0, s0, s4
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s1, s1, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: ps_main:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s3, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s0, s0, s4
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: ps_main:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX9-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: ps_main:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: ps_main:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: ps_main:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: ps_main:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s2
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: ps_main:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: ps_main:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s2, s2, s0
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: ps_main:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%r = fadd float %v1, %v2
@@ -135,6 +1158,1028 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
define amdgpu_vs float @vs_main(i32 %idx) {
+; SI-LABEL: vs_main:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s0, s0, s4
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s1, s1, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: vs_main:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s3, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s0, s0, s4
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: vs_main:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX9-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: vs_main:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: vs_main:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: vs_main:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: vs_main:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s2
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: vs_main:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: vs_main:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s2, s2, s0
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: vs_main:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%r = fadd float %v1, %v2
@@ -185,6 +2230,1032 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
define amdgpu_cs float @cs_main(i32 %idx) {
+; SI-LABEL: cs_main:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s0, s0, s4
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s1, s1, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: cs_main:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s3, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s0, s0, s4
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: cs_main:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX9-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: cs_main:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: cs_main:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s4, s0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: cs_main:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s2
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v27, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v27
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v25, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v26, v5
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v24, v7
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v23, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v27
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: cs_main:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s2
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: cs_main:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x10
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v27, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v27
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, v7
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v27
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[21:24], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: cs_main:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x10
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s2, s2, s0
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: cs_main:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%r = fadd float %v1, %v2
@@ -217,6 +3288,1025 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
define amdgpu_hs float @hs_main(i32 %idx) {
+; SI-LABEL: hs_main:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s0, s0, s4
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s1, s1, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: hs_main:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s3, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s0, s0, s4
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: hs_main:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5
+; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: hs_main:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s0, s0, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: hs_main:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: hs_main:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: hs_main:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: hs_main:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: hs_main:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: hs_main:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%r = fadd float %v1, %v2
@@ -268,6 +4358,1025 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
define amdgpu_gs float @gs_main(i32 %idx) {
+; SI-LABEL: gs_main:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_mov_b32 s3, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s0, s0, s4
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s1, s1, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: gs_main:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s4, s0
+; VI-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s3, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s0, s0, s4
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: gs_main:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s3, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s0, s0, s5
+; GFX9-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: gs_main:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s3, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s0, s0, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: gs_main:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s3, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s0, s0, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s1, s1, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[0:3], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[0:3], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[0:3], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[0:3], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: gs_main:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: gs_main:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: gs_main:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: gs_main:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: gs_main:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v23, v21 :: v_dual_mov_b32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v6, 0x3f5f2ee2 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e319356
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v11, 0xbe31934f :: v_dual_mov_b32 v4, v6
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v9, 0xb702e758 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, 0xb7043519 :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v13 :: v_dual_mov_b32 v33, v22
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%r = fadd float %v1, %v2
@@ -327,6 +5436,1032 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) {
+; SI-LABEL: hs_ir_uses_scratch_offset:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s11, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s8, s8, s6
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s9, s9, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; SI-NEXT: s_mov_b32 s2, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: hs_ir_uses_scratch_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s11, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s8, s8, s6
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s9, s9, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; VI-NEXT: s_mov_b32 s2, s5
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: hs_ir_uses_scratch_offset:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s10, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: hs_ir_uses_scratch_offset:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: hs_ir_uses_scratch_offset:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: hs_ir_uses_scratch_offset:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_mov_b32 s2, s7
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: hs_ir_uses_scratch_offset:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_mov_b32 s2, s7
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: hs_ir_uses_scratch_offset:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: hs_ir_uses_scratch_offset:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: hs_ir_uses_scratch_offset:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v30, v13
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_mov_b32 s2, s5
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%f = fadd float %v1, %v2
@@ -382,6 +6517,1032 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
; GFX11-FLATSCR: scratch_load_b32 {{v[0-9]+}}, {{v[0-9]+}}, off
define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) {
+; SI-LABEL: gs_ir_uses_scratch_offset:
+; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s11, 0xe8f000
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: s_add_u32 s8, s8, s6
+; SI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_addc_u32 s9, s9, 0
+; SI-NEXT: v_add_i32_e32 v1, vcc, 0x200, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; SI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; SI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; SI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; SI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; SI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; SI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; SI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; SI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; SI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; SI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; SI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; SI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; SI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; SI-NEXT: s_waitcnt expcnt(1)
+; SI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; SI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; SI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; SI-NEXT: s_waitcnt expcnt(3)
+; SI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; SI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; SI-NEXT: s_waitcnt expcnt(2)
+; SI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
+; SI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; SI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:772
+; SI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
+; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
+; SI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
+; SI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:748
+; SI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:744
+; SI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:740
+; SI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:736
+; SI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:732
+; SI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:728
+; SI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:724
+; SI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:720
+; SI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
+; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
+; SI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; SI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; SI-NEXT: s_mov_b32 s2, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: gs_ir_uses_scratch_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; VI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s11, 0xe80000
+; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_u32 s8, s8, s6
+; VI-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_addc_u32 s9, s9, 0
+; VI-NEXT: v_add_u32_e32 v1, vcc, 0x200, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; VI-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; VI-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; VI-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; VI-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; VI-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; VI-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; VI-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; VI-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; VI-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; VI-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
+; VI-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; VI-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
+; VI-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; VI-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; VI-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; VI-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; VI-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; VI-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; VI-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; VI-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; VI-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; VI-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; VI-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; VI-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
+; VI-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; VI-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:772
+; VI-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
+; VI-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; VI-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
+; VI-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
+; VI-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
+; VI-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:748
+; VI-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:744
+; VI-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:740
+; VI-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:736
+; VI-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:732
+; VI-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:728
+; VI-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:724
+; VI-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:720
+; VI-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
+; VI-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
+; VI-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; VI-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; VI-NEXT: s_mov_b32 s2, s5
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v0, v0, v1
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-MUBUF-LABEL: gs_ir_uses_scratch_offset:
+; GFX9-MUBUF: ; %bb.0:
+; GFX9-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-MUBUF-NEXT: s_mov_b32 s10, -1
+; GFX9-MUBUF-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX9-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbf20e7f4
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f523be1
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v6, 0x3f638e37
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:320
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:316
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:312
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:308
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:304
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:300
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:296
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:292
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:288
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd8a3
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:284
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbeae29dc
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbe31934f
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:280
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:276
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:272
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:264
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:260
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:256
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e31934f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89c
+; GFX9-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbe319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:252
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v9, 0x3e319356
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29dc
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:236
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3efcd89f
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf20e7f5
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v13, 0xbf3d349e
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX9-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:268
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:248
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:240
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:232
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:228
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:224
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:200
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f5
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v1, 0x200, v0
+; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 0, v0
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:196
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f4
+; GFX9-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_nop 0
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:832
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:828
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:824
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:820
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3703c499
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3f3d349c
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:816
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:812
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:808
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:804
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:800
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:796
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:792
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:788
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:784
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:780
+; GFX9-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; GFX9-MUBUF-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:772
+; GFX9-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:764
+; GFX9-MUBUF-NEXT: v_mov_b32_e32 v5, 0xbf5f2ee2
+; GFX9-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:760
+; GFX9-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:756
+; GFX9-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:752
+; GFX9-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:748
+; GFX9-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:744
+; GFX9-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:740
+; GFX9-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:736
+; GFX9-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:732
+; GFX9-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:728
+; GFX9-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:724
+; GFX9-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:720
+; GFX9-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:716
+; GFX9-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:712
+; GFX9-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:708
+; GFX9-MUBUF-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
+; GFX9-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX9-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX9-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W32-MUBUF-LABEL: gs_ir_uses_scratch_offset:
+; GFX10_W32-MUBUF: ; %bb.0:
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s10, -1
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W32-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W32-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:320
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:316
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:280
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:276
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:272
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W32-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:256
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:252
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W32-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:236
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W32-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:820
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:788
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
+; GFX10_W32-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:764
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:760
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:756
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:752
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:748
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:744
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:740
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:736
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:732
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:728
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:724
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:720
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
+; GFX10_W32-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
+; GFX10_W32-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
+; GFX10_W32-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX10_W32-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W32-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W32-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX10_W64-MUBUF-LABEL: gs_ir_uses_scratch_offset:
+; GFX10_W64-MUBUF: ; %bb.0:
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s10, -1
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v2, 0x3f3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v4, 0x3f5f2ee2
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v3, 0x3f523be1
+; GFX10_W64-MUBUF-NEXT: s_add_u32 s8, s8, s5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbefcd8a3
+; GFX10_W64-MUBUF-NEXT: s_addc_u32 s9, s9, 0
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v8, 0xbefcd89f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:320
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:316
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:312
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:308
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:304
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:300
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:296
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:292
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:288
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:284
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:280
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:276
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:272
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v1, 0xbe319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0xbe31934f
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0xb702e758
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3e31934f
+; GFX10_W64-MUBUF-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:268
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:264
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:260
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:256
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:252
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v7, 0x3e319356
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0x3eae29d8
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v11, 0x3eae29dc
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0x3efcd89c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v13, 0x3efcd89f
+; GFX10_W64-MUBUF-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:248
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:244
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:240
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:236
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:232
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v10, 0xbf20e7f5
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v15, 0xbf5f2ee3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v12, 0xbf3d349e
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf523be3
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v16, 0xbf638e39
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v6, 0x200, v0
+; GFX10_W64-MUBUF-NEXT: v_add_nc_u32_e32 v0, 0, v0
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:228
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:224
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:220
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:216
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:212
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v17, 0x3f20e7f5
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:208
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:204
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f20e7f4
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:200
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:196
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v18, 0x3703c499
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:832
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:828
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:824
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:820
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:816
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:812
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0x3f3d349c
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v19, 0xbf523be1
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:808
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:804
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:800
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:796
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:792
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:788
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:784
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:780
+; GFX10_W64-MUBUF-NEXT: v_mov_b32_e32 v14, 0xbf5f2ee2
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v15, off, s[8:11], 0 offset:776
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:772
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v16, off, s[8:11], 0 offset:768
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:764
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v14, off, s[8:11], 0 offset:760
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:756
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v19, off, s[8:11], 0 offset:752
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:748
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v12, off, s[8:11], 0 offset:744
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v17, off, s[8:11], 0 offset:740
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v13, off, s[8:11], 0 offset:736
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v11, off, s[8:11], 0 offset:732
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:728
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v18, off, s[8:11], 0 offset:724
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:720
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:716
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:712
+; GFX10_W64-MUBUF-NEXT: buffer_store_dword v10, off, s[8:11], 0 offset:708
+; GFX10_W64-MUBUF-NEXT: buffer_load_dword v1, v6, s[8:11], 0 offen
+; GFX10_W64-MUBUF-NEXT: s_mov_b32 s2, s5
+; GFX10_W64-MUBUF-NEXT: s_waitcnt vmcnt(0)
+; GFX10_W64-MUBUF-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10_W64-MUBUF-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-LABEL: gs_ir_uses_scratch_offset:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-NEXT: s_mov_b32 s2, s7
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-LABEL: gs_ir_uses_scratch_offset:
+; GFX10-FLATSCR: ; %bb.0:
+; GFX10-FLATSCR-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-NEXT: s_mov_b32 s2, s7
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-NEXT: ; return to shader part epilog
+;
+; GFX9-FLATSCR-PAL-LABEL: gs_ir_uses_scratch_offset:
+; GFX9-FLATSCR-PAL: ; %bb.0:
+; GFX9-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX9-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-NEXT: v_and_b32_e32 v23, 0x1fc, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX9-FLATSCR-PAL-NEXT: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-PAL-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:320
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], s0 offset:240
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0xbf3d349e
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0x3efcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89c
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:288
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0xb702e758
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xbeae29dc
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xb7043519
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, 0xbe31934f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbe319356
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:224
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf523be3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf5f2ee3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f638e37
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbefcd89f
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:256
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0xbf638e39
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f20e7f5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v15
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v21
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:304
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:272
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:208
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:192
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0, v23
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v18
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v4
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v5
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v13, v1, off
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[19:22], s0 offset:768
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0xbf523be1
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0x3f3d349c
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v6
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], s0 offset:832
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:784
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v7
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, v5
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v17
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0x3703c499
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v17
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[18:21], s0 offset:752
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[7:10], s0 offset:736
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[3:6], s0 offset:816
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v11
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, v2
+; GFX9-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v0
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[8:11], s0 offset:800
+; GFX9-FLATSCR-PAL-NEXT: v_add_u32_e32 v1, 0x200, v23
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], s0 offset:720
+; GFX9-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[15:18], s0 offset:704
+; GFX9-FLATSCR-PAL-NEXT: scratch_load_dword v0, v1, off
+; GFX9-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5
+; GFX9-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v13, v0
+; GFX9-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX10-FLATSCR-PAL-LABEL: gs_ir_uses_scratch_offset:
+; GFX10-FLATSCR-PAL: ; %bb.0:
+; GFX10-FLATSCR-PAL-NEXT: s_getpc_b64 s[0:1]
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s0, s8
+; GFX10-FLATSCR-PAL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX10-FLATSCR-PAL-NEXT: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-PAL-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xbf20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v9, 0xbeae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:320
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v8
+; GFX10-FLATSCR-PAL-NEXT: v_and_b32_e32 v0, 0x1fc, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
+; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v35, 0, v0
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, 0xb7043519
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, 0xbe31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, 0xbe319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v17, 0xbf20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v13, 0x3eae29d8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v15, 0x3e31934f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v30, v18
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:256
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[12:15], off offset:240
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:224
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:208
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:192
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v17
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v31, 0xbf523be1
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, 0x3f3d349c
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v32, v7
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, v16
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v10, v35, off
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v21
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v36, v5
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v37, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v38, v6
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[4:7], off offset:832
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[27:30], off offset:816
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:800
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:784
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v33, 0xbf5f2ee2
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v34, v6
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, v18
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v8
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v1, v27
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v2, v14
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v3, v12
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, v11
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, v9
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[35:38], off offset:768
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[31:34], off offset:752
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[23:26], off offset:736
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:720
+; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[16:19], off offset:704
+; GFX10-FLATSCR-PAL-NEXT: scratch_load_dword v0, v39, off
+; GFX10-FLATSCR-PAL-NEXT: s_mov_b32 s2, s5
+; GFX10-FLATSCR-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLATSCR-PAL-NEXT: v_add_f32_e32 v0, v10, v0
+; GFX10-FLATSCR-PAL-NEXT: ; return to shader part epilog
+;
+; GFX11-FLATSCR-LABEL: gs_ir_uses_scratch_offset:
+; GFX11-FLATSCR: ; %bb.0:
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v1, 0xbf20e7f4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v28, 0x3703c499
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf5f2ee3
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v8, 0x3f3d349e
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v7, 0x3f523be1
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v17, 0xbf3d349e
+; GFX11-FLATSCR-NEXT: v_and_b32_e32 v37, 0x1fc, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v0, 0xbeae29dc :: v_dual_mov_b32 v23, v21
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v6, 0x3f5f2ee2
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v5, 0x3f638e37
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:320
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v7
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v24, 0xbf523be3
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[5:8], off offset:304
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:288
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v2, 0xbefcd89f :: v_dual_mov_b32 v27, v24
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v13, 0x3eae29dc :: v_dual_mov_b32 v34, v5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, 0xbefcd8a3
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v15, 0x3e319356 :: v_dual_mov_b32 v36, v6
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, 0xb702e758
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v10, 0xb7043519
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbe31934f
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v12, 0xbe319356 :: v_dual_mov_b32 v31, v19
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v18, 0xbf20e7f5
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
+; GFX11-FLATSCR-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v30, v13
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x3f20e7f4
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v9, v18
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[13:16], off offset:240
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:224
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, v0
+; GFX11-FLATSCR-NEXT: s_clause 0x1
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[21:24], off offset:208
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:192
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v14, v37, off
+; GFX11-FLATSCR-NEXT: s_clause 0x2
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[1:4], off offset:832
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[28:31], off offset:816
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:800
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v29, 0xbf523be1
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v32, 0x3f3d349c :: v_dual_mov_b32 v5, v15
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v30, v7 :: v_dual_mov_b32 v31, v17
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v3, v12
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v4, v28
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v24, v19 :: v_dual_mov_b32 v35, v21
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:784
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v31, 0xbf5f2ee2 :: v_dual_mov_b32 v32, v6
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v27, v8 :: v_dual_mov_b32 v6, v13
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v19, v2
+; GFX11-FLATSCR-NEXT: s_clause 0x4
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[33:36], off offset:768
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[29:32], off offset:752
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[24:27], off offset:736
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[3:6], off offset:720
+; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[17:20], off offset:704
+; GFX11-FLATSCR-NEXT: scratch_load_b32 v0, v37, off offset:512
+; GFX11-FLATSCR-NEXT: s_mov_b32 s2, s5
+; GFX11-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLATSCR-NEXT: v_add_f32_e32 v0, v14, v0
+; GFX11-FLATSCR-NEXT: ; return to shader part epilog
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
%f = fadd float %v1, %v2
@@ -389,3 +7550,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
%r2 = insertvalue <{i32, i32, i32, float}> %r1, float %f, 3
ret <{i32, i32, i32, float}> %r2
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FLATSCR: {{.*}}
+; GCN: {{.*}}
+; GFX9PLUS: {{.*}}
+; GFX9_10-MUBUF: {{.*}}
+; MUBUF: {{.*}}
+; SIVI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
index 38672da3c647b0..1b2b9d68fff847 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SI,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX89,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX9,GFX89,FUNC %s
@@ -16,6 +17,40 @@
; EG: LSHR * [[ADDR]]
; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1
define amdgpu_kernel void @sext_in_reg_i1_i32(ptr addrspace(1) %out, i32 %in) #0 {
+; SI-LABEL: sext_in_reg_i1_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_i32 s4, s2, 0x10000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i1_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_bfe_i32 s0, s2, 0x10000
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i1_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_INT * T1.X, KC0[2].Z, 0.0, 1,
%shl = shl i32 %in, 31
%sext = ashr i32 %shl, 31
store i32 %sext, ptr addrspace(1) %out
@@ -33,6 +68,59 @@ define amdgpu_kernel void @sext_in_reg_i1_i32(ptr addrspace(1) %out, i32 %in) #0
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
define amdgpu_kernel void @sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
+; SI-LABEL: sext_in_reg_i8_to_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_sext_i32_i8 s2, s2
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: sext_in_reg_i8_to_i32:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s4, s0
+; GFX89-NEXT: s_add_i32 s0, s2, s3
+; GFX89-NEXT: s_sext_i32_i8 s0, s0
+; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s0
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i8_to_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_add_i32 s0, s2, s3
+; GFX9-NEXT: s_sext_i32_i8 s0, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i8_to_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, KC0[2].W,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
%c = add i32 %a, %b ; add to prevent folding into extload
%shl = shl i32 %c, 24
%ashr = ashr i32 %shl, 24
@@ -51,6 +139,59 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32(ptr addrspace(1) %out, i32 %a,
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
define amdgpu_kernel void @sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
+; SI-LABEL: sext_in_reg_i16_to_i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_sext_i32_i16 s2, s2
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: sext_in_reg_i16_to_i32:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s4, s0
+; GFX89-NEXT: s_add_i32 s0, s2, s3
+; GFX89-NEXT: s_sext_i32_i16 s0, s0
+; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s0
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i16_to_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_add_i32 s0, s2, s3
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i16_to_i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, KC0[2].W,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
%c = add i32 %a, %b ; add to prevent folding into extload
%shl = shl i32 %c, 16
%ashr = ashr i32 %shl, 16
@@ -69,6 +210,59 @@ define amdgpu_kernel void @sext_in_reg_i16_to_i32(ptr addrspace(1) %out, i32 %a,
; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
; EG-NEXT: LSHR * [[ADDR]]
define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_i8_to_v1i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_sext_i32_i8 s2, s2
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: sext_in_reg_i8_to_v1i32:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s4, s0
+; GFX89-NEXT: s_add_i32 s0, s2, s3
+; GFX89-NEXT: s_sext_i32_i8 s0, s0
+; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s0
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i8_to_v1i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_add_i32 s0, s2, s3
+; GFX9-NEXT: s_sext_i32_i8 s0, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i8_to_v1i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, KC0[2].W,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
%c = add <1 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <1 x i32> %c, <i32 24>
%ashr = ashr <1 x i32> %shl, <i32 24>
@@ -83,6 +277,53 @@ define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(ptr addrspace(1) %out, <1 x i
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @sext_in_reg_i1_to_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
+; SI-LABEL: sext_in_reg_i1_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], s0
+; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i1_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], s8
+; GFX9-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i1_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, KC0[2].W, PV.W,
+; EG-NEXT: AND_INT * T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T0.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, 1,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOV * T0.Y, PV.X,
%c = shl i64 %a, %b
%shl = shl i64 %c, 63
%ashr = ashr i64 %shl, 63
@@ -97,6 +338,54 @@ define amdgpu_kernel void @sext_in_reg_i1_to_i64(ptr addrspace(1) %out, i64 %a,
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @sext_in_reg_i8_to_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
+; SI-LABEL: sext_in_reg_i8_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], s0
+; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i8_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], s8
+; GFX9-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i8_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, KC0[2].W, PV.W,
+; EG-NEXT: AND_INT * T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T0.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
%c = shl i64 %a, %b
%shl = shl i64 %c, 56
%ashr = ashr i64 %shl, 56
@@ -112,6 +401,54 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i64(ptr addrspace(1) %out, i64 %a,
; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @sext_in_reg_i16_to_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
+; SI-LABEL: sext_in_reg_i16_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], s0
+; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i16_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], s8
+; GFX9-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i16_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, KC0[2].W, PV.W,
+; EG-NEXT: AND_INT * T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T0.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
%c = shl i64 %a, %b
%shl = shl i64 %c, 48
%ashr = ashr i64 %shl, 48
@@ -126,6 +463,53 @@ define amdgpu_kernel void @sext_in_reg_i16_to_i64(ptr addrspace(1) %out, i64 %a,
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @sext_in_reg_i32_to_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
+; SI-LABEL: sext_in_reg_i32_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dword s0, s[0:1], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], s0
+; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i32_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], s8
+; GFX9-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x200000
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i32_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, KC0[2].W, PV.W,
+; EG-NEXT: AND_INT * T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
%c = shl i64 %a, %b
%shl = shl i64 %c, 32
%ashr = ashr i64 %shl, 32
@@ -161,6 +545,63 @@ define amdgpu_kernel void @sext_in_reg_i32_to_i64(ptr addrspace(1) %out, i64 %a,
; SI: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
+; SI-LABEL: v_sext_in_reg_i1_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 1
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i1_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i1_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, T0.X, PV.W,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T1.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, 1,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
@@ -188,6 +629,64 @@ define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(ptr addrspace(1) %out, ptr ad
; SI: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
+; SI-LABEL: v_sext_in_reg_i8_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 8
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i8_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i8_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 11, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, T0.X, PV.W,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T1.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
@@ -215,6 +714,64 @@ define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(ptr addrspace(1) %out, ptr ad
; SI: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
+; SI-LABEL: v_sext_in_reg_i16_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i16_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i16_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 11, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, T0.X, PV.W,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T1.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
@@ -239,6 +796,60 @@ define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(ptr addrspace(1) %out, ptr a
; GCN: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[LO]]:[[SHR]]]
define amdgpu_kernel void @v_sext_in_reg_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
+; SI-LABEL: v_sext_in_reg_i32_to_i64:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i32_to_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i32_to_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, T0.X, PV.W,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
+; EG-NEXT: ASHR * T0.Y, PV.X, literal.y,
+; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
@@ -265,6 +876,61 @@ define amdgpu_kernel void @v_sext_in_reg_i32_to_i64(ptr addrspace(1) %out, ptr a
; EG: ASHR [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_i1_in_i32_other_amount(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
+; SI-LABEL: sext_in_reg_i1_in_i32_other_amount:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_bfe_i32 s2, s2, 0x190001
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: sext_in_reg_i1_in_i32_other_amount:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s4, s0
+; GFX89-NEXT: s_add_i32 s0, s2, s3
+; GFX89-NEXT: s_bfe_i32 s0, s0, 0x190001
+; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s0
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_i1_in_i32_other_amount:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_add_i32 s0, s2, s3
+; GFX9-NEXT: s_bfe_i32 s0, s0, 0x190001
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_i1_in_i32_other_amount:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, KC0[2].W,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 7(9.809089e-45), 2(2.802597e-45)
%c = add i32 %a, %b
%x = shl i32 %c, 6
%y = ashr i32 %x, 7
@@ -288,6 +954,55 @@ define amdgpu_kernel void @sext_in_reg_i1_in_i32_other_amount(ptr addrspace(1) %
; EG: ASHR [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i1_in_v2i32_other_amount:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s4, s6
+; SI-NEXT: s_add_i32 s4, s5, s7
+; SI-NEXT: s_bfe_i32 s4, s4, 0x190001
+; SI-NEXT: s_bfe_i32 s5, s2, 0x190001
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i1_in_v2i32_other_amount:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s0, s4, s6
+; GFX9-NEXT: s_add_i32 s1, s5, s7
+; GFX9-NEXT: s_bfe_i32 s1, s1, 0x190001
+; GFX9-NEXT: s_bfe_i32 s0, s0, 0x190001
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i1_in_v2i32_other_amount:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[3].X, KC0[3].Z,
+; EG-NEXT: ADD_INT T1.W, KC0[2].W, KC0[3].Y,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.Y, PS, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: 7(9.809089e-45), 6(8.407791e-45)
+; EG-NEXT: ASHR T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 7(9.809089e-45), 2(2.802597e-45)
%c = add <2 x i32> %a, %b
%x = shl <2 x i32> %c, <i32 6, i32 6>
%y = ashr <2 x i32> %x, <i32 7, i32 7>
@@ -306,6 +1021,51 @@ define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(ptr addrspace(
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i1_to_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s4, s6
+; SI-NEXT: s_add_i32 s4, s5, s7
+; SI-NEXT: s_bfe_i32 s4, s4, 0x10000
+; SI-NEXT: s_bfe_i32 s5, s2, 0x10000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i1_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s0, s4, s6
+; GFX9-NEXT: s_add_i32 s1, s5, s7
+; GFX9-NEXT: s_bfe_i32 s1, s1, 0x10000
+; GFX9-NEXT: s_bfe_i32 s0, s0, 0x10000
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i1_to_v2i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[3].X, KC0[3].Z,
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, 1,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].W, KC0[3].Y,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, 1,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i32> %c, <i32 31, i32 31>
%ashr = ashr <2 x i32> %shl, <i32 31, i32 31>
@@ -327,6 +1087,67 @@ define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i32(ptr addrspace(1) %out, <2 x
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_v4i1_to_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_v4i1_to_v4i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s4, s8
+; SI-NEXT: s_add_i32 s4, s5, s9
+; SI-NEXT: s_add_i32 s5, s6, s10
+; SI-NEXT: s_add_i32 s6, s7, s11
+; SI-NEXT: s_bfe_i32 s6, s6, 0x10000
+; SI-NEXT: s_bfe_i32 s5, s5, 0x10000
+; SI-NEXT: s_bfe_i32 s4, s4, 0x10000
+; SI-NEXT: s_bfe_i32 s7, s2, 0x10000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s7
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: v_mov_b32_e32 v3, s6
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v4i1_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s4, s4, s8
+; GFX9-NEXT: s_add_i32 s5, s5, s9
+; GFX9-NEXT: s_add_i32 s6, s6, s10
+; GFX9-NEXT: s_add_i32 s7, s7, s11
+; GFX9-NEXT: s_bfe_i32 s7, s7, 0x10000
+; GFX9-NEXT: s_bfe_i32 s6, s6, 0x10000
+; GFX9-NEXT: s_bfe_i32 s5, s5, 0x10000
+; GFX9-NEXT: s_bfe_i32 s4, s4, 0x10000
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v4i1_to_v4i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[4].X, KC0[5].X,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, 1,
+; EG-NEXT: ADD_INT * T1.W, KC0[3].W, KC0[4].W,
+; EG-NEXT: BFE_INT T0.Z, PS, 0.0, 1,
+; EG-NEXT: ADD_INT * T1.W, KC0[3].Z, KC0[4].Z,
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, 1,
+; EG-NEXT: ADD_INT * T1.W, KC0[3].Y, KC0[4].Y,
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, 1,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <4 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
%ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31>
@@ -344,6 +1165,52 @@ define amdgpu_kernel void @sext_in_reg_v4i1_to_v4i32(ptr addrspace(1) %out, <4 x
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i8_to_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s4, s6
+; SI-NEXT: s_add_i32 s4, s5, s7
+; SI-NEXT: s_sext_i32_i8 s4, s4
+; SI-NEXT: s_sext_i32_i8 s5, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i8_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s0, s4, s6
+; GFX9-NEXT: s_add_i32 s1, s5, s7
+; GFX9-NEXT: s_sext_i32_i8 s1, s1
+; GFX9-NEXT: s_sext_i32_i8 s0, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i8_to_v2i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[3].X, KC0[3].Z,
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].W, KC0[3].Y,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i32> %c, <i32 24, i32 24>
%ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
@@ -365,6 +1232,70 @@ define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i32(ptr addrspace(1) %out, <2 x
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_v4i8_to_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_v4i8_to_v4i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s4, s8
+; SI-NEXT: s_add_i32 s4, s5, s9
+; SI-NEXT: s_add_i32 s5, s6, s10
+; SI-NEXT: s_add_i32 s6, s7, s11
+; SI-NEXT: s_sext_i32_i8 s6, s6
+; SI-NEXT: s_sext_i32_i8 s5, s5
+; SI-NEXT: s_sext_i32_i8 s4, s4
+; SI-NEXT: s_sext_i32_i8 s7, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s7
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: v_mov_b32_e32 v3, s6
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v4i8_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s4, s4, s8
+; GFX9-NEXT: s_add_i32 s5, s5, s9
+; GFX9-NEXT: s_add_i32 s6, s6, s10
+; GFX9-NEXT: s_add_i32 s7, s7, s11
+; GFX9-NEXT: s_sext_i32_i8 s7, s7
+; GFX9-NEXT: s_sext_i32_i8 s6, s6
+; GFX9-NEXT: s_sext_i32_i8 s5, s5
+; GFX9-NEXT: s_sext_i32_i8 s4, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v4i8_to_v4i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[4].X, KC0[5].X,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[3].W, KC0[4].W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[3].Z, KC0[4].Z,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[3].Y, KC0[4].Y,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
%c = add <4 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
%ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
@@ -382,6 +1313,52 @@ define amdgpu_kernel void @sext_in_reg_v4i8_to_v4i32(ptr addrspace(1) %out, <4 x
; EG: BFE_INT [[RES]]
; EG: LSHR {{\*?}} [[ADDR]]
define amdgpu_kernel void @sext_in_reg_v2i16_to_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i16_to_v2i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_i32 s2, s4, s6
+; SI-NEXT: s_add_i32 s4, s5, s7
+; SI-NEXT: s_sext_i32_i16 s4, s4
+; SI-NEXT: s_sext_i32_i16 s5, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i16_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_i32 s0, s4, s6
+; GFX9-NEXT: s_add_i32 s1, s5, s7
+; GFX9-NEXT: s_sext_i32_i16 s1, s1
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i16_to_v2i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: ADD_INT * T0.W, KC0[3].X, KC0[3].Z,
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].W, KC0[3].Y,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
%c = add <2 x i32> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i32> %c, <i32 16, i32 16>
%ashr = ashr <2 x i32> %shl, <i32 16, i32 16>
@@ -391,6 +1368,69 @@ define amdgpu_kernel void @sext_in_reg_v2i16_to_v2i32(ptr addrspace(1) %out, <2
; FUNC-LABEL: {{^}}testcase:
define amdgpu_kernel void @testcase(ptr addrspace(1) %out, i8 %a) #0 {
+; SI-LABEL: testcase:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_sext_i32_i8 s4, s2
+; SI-NEXT: s_bfe_i32 s5, s2, 0x10000
+; SI-NEXT: s_max_i32 s4, s4, 0
+; SI-NEXT: s_and_b32 s2, s5, s2
+; SI-NEXT: s_xor_b32 s4, s4, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: testcase:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_sext_i32_i8 s0, s2
+; GFX9-NEXT: s_max_i32 s0, s0, 0
+; GFX9-NEXT: s_bitcmp1_b32 s2, 0
+; GFX9-NEXT: s_cselect_b32 s1, s2, 0
+; GFX9-NEXT: s_xor_b32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: testcase:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT T0.Z, T0.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, 1,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, T0.X,
+; EG-NEXT: MAX_INT * T1.W, PV.Z, 0.0,
+; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: XOR_INT * T0.W, PS, PV.W,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and_a_1 = and i8 %a, 1
%cmp_eq = icmp eq i8 %and_a_1, 0
%cmp_slt = icmp slt i8 %a, 0
@@ -403,6 +1443,69 @@ define amdgpu_kernel void @testcase(ptr addrspace(1) %out, i8 %a) #0 {
; FUNC-LABEL: {{^}}testcase_3:
define amdgpu_kernel void @testcase_3(ptr addrspace(1) %out, i8 %a) #0 {
+; SI-LABEL: testcase_3:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_sext_i32_i8 s4, s2
+; SI-NEXT: s_bfe_i32 s5, s2, 0x10000
+; SI-NEXT: s_max_i32 s4, s4, 0
+; SI-NEXT: s_and_b32 s2, s5, s2
+; SI-NEXT: s_xor_b32 s4, s4, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: testcase_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_sext_i32_i8 s0, s2
+; GFX9-NEXT: s_max_i32 s0, s0, 0
+; GFX9-NEXT: s_bitcmp1_b32 s2, 0
+; GFX9-NEXT: s_cselect_b32 s1, s2, 0
+; GFX9-NEXT: s_xor_b32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: testcase_3:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT T0.Z, T0.X, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, 1,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, T0.X,
+; EG-NEXT: MAX_INT * T1.W, PV.Z, 0.0,
+; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: XOR_INT * T0.W, PS, PV.W,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%and_a_1 = and i8 %a, 1
%cmp_eq = icmp eq i8 %and_a_1, 0
%cmp_slt = icmp slt i8 %a, 0
@@ -419,6 +1522,92 @@ define amdgpu_kernel void @testcase_3(ptr addrspace(1) %out, i8 %a) #0 {
; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
define amdgpu_kernel void @vgpr_sext_in_reg_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 {
+; SI-LABEL: vgpr_sext_in_reg_v4i8_to_v4i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v5
+; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; SI-NEXT: v_bfe_i32 v3, v3, 0, 8
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 8
+; SI-NEXT: v_bfe_i32 v1, v1, 0, 8
+; SI-NEXT: v_bfe_i32 v0, v0, 0, 8
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: vgpr_sext_in_reg_v4i8_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
+; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
+; GFX9-NEXT: v_add_u32_e32 v1, v5, v1
+; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
+; GFX9-NEXT: v_add_u32_e32 v3, v7, v3
+; GFX9-NEXT: v_bfe_i32 v3, v3, 0, 8
+; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: vgpr_sext_in_reg_v4i8_to_v4i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV T0.X, KC0[2].Z,
+; EG-NEXT: MOV * T1.X, KC0[2].W,
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: ADD_INT * T0.W, T0.W, T1.W,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T0.Z, T1.Z,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T0.Y, T1.Y,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T0.X, T1.X,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
%loada = load <4 x i32>, ptr addrspace(1) %a, align 16
%loadb = load <4 x i32>, ptr addrspace(1) %b, align 16
%c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
@@ -432,6 +1621,92 @@ define amdgpu_kernel void @vgpr_sext_in_reg_v4i8_to_v4i32(ptr addrspace(1) %out,
; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
define amdgpu_kernel void @vgpr_sext_in_reg_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 {
+; SI-LABEL: vgpr_sext_in_reg_v4i16_to_v4i32:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_mov_b32 s11, 0xf000
+; SI-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s14, s10
+; SI-NEXT: s_mov_b32 s15, s11
+; SI-NEXT: s_mov_b32 s2, s10
+; SI-NEXT: s_mov_b32 s3, s11
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s12, s6
+; SI-NEXT: s_mov_b32 s13, s7
+; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[12:15], 0
+; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0
+; SI-NEXT: s_mov_b32 s8, s4
+; SI-NEXT: s_mov_b32 s9, s5
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v5
+; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; SI-NEXT: v_bfe_i32 v3, v3, 0, 16
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SI-NEXT: v_bfe_i32 v1, v1, 0, 16
+; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: vgpr_sext_in_reg_v4i16_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
+; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
+; GFX9-NEXT: v_add_u32_e32 v1, v5, v1
+; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
+; GFX9-NEXT: v_add_u32_e32 v3, v7, v3
+; GFX9-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: vgpr_sext_in_reg_v4i16_to_v4i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 1 @6
+; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 10:
+; EG-NEXT: MOV T0.X, KC0[2].Z,
+; EG-NEXT: MOV * T1.X, KC0[2].W,
+; EG-NEXT: ALU clause starting at 12:
+; EG-NEXT: ADD_INT * T0.W, T0.W, T1.W,
+; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T0.Z, T1.Z,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Z, PS, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T0.Y, T1.Y,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T0.X, T1.X,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
%loada = load <4 x i32>, ptr addrspace(1) %a, align 16
%loadb = load <4 x i32>, ptr addrspace(1) %b, align 16
%c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
@@ -447,6 +1722,86 @@ define amdgpu_kernel void @vgpr_sext_in_reg_v4i16_to_v4i32(ptr addrspace(1) %out
; GCN-NOT: bfe
; GCN: buffer_store_short
define amdgpu_kernel void @sext_in_reg_to_illegal_type(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %src) #0 {
+; SI-LABEL: sext_in_reg_to_illegal_type:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s10, s6
+; SI-NEXT: s_mov_b32 s11, s7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s8, s2
+; SI-NEXT: s_mov_b32 s9, s3
+; SI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_max_i32_e32 v0, 0, v0
+; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: sext_in_reg_to_illegal_type:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_mov_b32 s10, s6
+; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s8, s2
+; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
+; GFX89-NEXT: s_mov_b32 s4, s0
+; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: v_max_i32_e32 v0, 0, v0
+; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_to_illegal_type:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s10, s6
+; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s2
+; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_i32_e32 v0, 0, v0
+; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_to_illegal_type:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
+; EG-NEXT: MAX_INT T0.W, PV.W, 0.0,
+; EG-NEXT: LSHL * T1.W, PS, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tmp5 = load i8, ptr addrspace(1) %src, align 1
%tmp2 = sext i8 %tmp5 to i32
%tmp2.5 = icmp sgt i32 %tmp2, 0
@@ -473,6 +1828,70 @@ define amdgpu_kernel void @sext_in_reg_to_illegal_type(ptr addrspace(1) nocaptur
; SI: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]]
; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[RESULT_LO]]:[[RESULT_HI]]]
define amdgpu_kernel void @v_sext_in_reg_i1_to_i64_move_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, i64 %s.val) #0 {
+; SI-LABEL: v_sext_in_reg_i1_to_i64_move_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
+; SI-NEXT: v_bfe_i32 v2, v2, 0, 1
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: v_and_b32_e32 v3, s9, v3
+; SI-NEXT: v_and_b32_e32 v2, s8, v2
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i1_to_i64_move_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
+; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX9-NEXT: v_and_b32_e32 v0, s2, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i1_to_i64_move_use:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 11, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, T0.X, PV.W,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T1.W, PS, PV.W, 0.0,
+; EG-NEXT: BFE_INT * T1.W, PV.W, 0.0, 1,
+; EG-NEXT: AND_INT * T0.Y, PV.W, KC0[3].Z,
+; EG-NEXT: AND_INT T0.X, T1.W, KC0[3].Y,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
@@ -503,6 +1922,69 @@ define amdgpu_kernel void @v_sext_in_reg_i1_to_i64_move_use(ptr addrspace(1) %ou
; SI: buffer_store_dwordx2 v[[[RESULT_LO]]:[[RESULT_HI]]]
; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v[[[RESULT_LO]]:[[RESULT_HI]]]
define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, i64 %s.val) #0 {
+; SI-LABEL: v_sext_in_reg_i32_to_i64_move_use:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
+; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v2
+; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; SI-NEXT: v_and_b32_e32 v3, s9, v3
+; SI-NEXT: v_and_b32_e32 v2, s8, v2
+; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i32_to_i64_move_use:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, v[0:1]
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX9-NEXT: v_and_b32_e32 v0, s2, v0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i32_to_i64_move_use:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: ALU clause starting at 11:
+; EG-NEXT: AND_INT * T1.W, T0.X, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, T0.X, PV.W,
+; EG-NEXT: AND_INT * T2.W, T0.X, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T1.W, PS, PV.W, 0.0,
+; EG-NEXT: AND_INT T0.X, PV.W, KC0[3].Y,
+; EG-NEXT: ASHR T1.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.X, PS, literal.x,
+; EG-NEXT: AND_INT * T0.Y, PV.W, KC0[3].Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid
@@ -529,6 +2011,75 @@ define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(ptr addrspace(1) %o
; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
define amdgpu_kernel void @s_sext_in_reg_i1_i16(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
+; SI-LABEL: s_sext_in_reg_i1_i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dword s2, s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_i32 s4, s2, 0x10000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: s_sext_in_reg_i1_i16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_lshl_b32 s4, s4, 15
+; GFX89-NEXT: s_sext_i32_i16 s4, s4
+; GFX89-NEXT: s_lshr_b32 s4, s4, 15
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_sext_in_reg_i1_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_lshl_b32 s4, s4, 15
+; GFX9-NEXT: s_sext_i32_i16 s4, s4
+; GFX9-NEXT: s_lshr_b32 s4, s4, 15
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: s_sext_in_reg_i1_i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, 1,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, PS, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ld = load i32, ptr addrspace(4) %ptr
%in = trunc i32 %ld to i16
%shl = shl i16 %in, 15
@@ -548,6 +2099,77 @@ define amdgpu_kernel void @s_sext_in_reg_i1_i16(ptr addrspace(1) %out, ptr addrs
; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
define amdgpu_kernel void @s_sext_in_reg_i2_i16(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
+; SI-LABEL: s_sext_in_reg_i2_i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_load_dword s2, s[2:3], 0x0
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_i32 s4, s2, 0x20000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX89-LABEL: s_sext_in_reg_i2_i16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_lshl_b32 s4, s4, 14
+; GFX89-NEXT: s_sext_i32_i16 s4, s4
+; GFX89-NEXT: s_lshr_b32 s4, s4, 14
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX89-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_sext_in_reg_i2_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_lshl_b32 s4, s4, 14
+; GFX9-NEXT: s_sext_i32_i16 s4, s4
+; GFX9-NEXT: s_lshr_b32 s4, s4, 14
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: s_sext_in_reg_i2_i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: LSHL T0.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 30(4.203895e-44), 3(4.203895e-45)
+; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
+; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%ld = load i32, ptr addrspace(4) %ptr
%in = trunc i32 %ld to i16
%shl = shl i16 %in, 14
@@ -562,6 +2184,48 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16(ptr addrspace(1) %out, ptr addrs
; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
define amdgpu_kernel void @v_sext_in_reg_i1_i16(ptr addrspace(3) %out, ptr addrspace(1) %ptr) #0 {
+; SI-LABEL: v_sext_in_reg_i1_i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s4, s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_bfe_i32 v1, v1, 0, 1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_write_b16 v0, v1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i1_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX9-NEXT: ds_write_b16 v0, v1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i1_i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 1, @41, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHL * T0.W, T0.X, 1,
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: TEX 0 @0
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU 2, @42, KC0[CB0:0-32], KC1[]
+; EG-NEXT: BFE_INT T1.W, T0.X, 0.0, 1,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: LDS_SHORT_WRITE * T0.W, T1.W,
+; EG-NEXT: RETURN
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr i16, ptr addrspace(1) %ptr, i32 %tid
%out.gep = getelementptr i16, ptr addrspace(3) %out, i32 %tid
@@ -583,6 +2247,64 @@ define amdgpu_kernel void @v_sext_in_reg_i1_i16(ptr addrspace(3) %out, ptr addrs
; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}}
; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
define amdgpu_kernel void @v_sext_in_reg_i1_i16_nonload(ptr addrspace(3) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, i16 %s.val) nounwind {
+; SI-LABEL: v_sext_in_reg_i1_i16_nonload:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s12, s[0:1], 0x9
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_mov_b64 s[10:11], s[6:7]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
+; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_load_ushort v1, v[0:1], s[8:11], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, v1, v2
+; SI-NEXT: v_bfe_i32 v1, v1, 0, 1
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: ds_write_b16 v0, v1
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_sext_in_reg_i1_i16_nonload:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, v2, v1
+; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 1
+; GFX9-NEXT: ds_write_b16 v0, v1
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: v_sext_in_reg_i1_i16_nonload:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 1, @43, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHL * T0.W, T0.X, 1,
+; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
+; EG-NEXT: TEX 0 @0
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
+; EG-NEXT: ALU 0, @44, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T1.X, KC0[2].W, T0.W,
+; EG-NEXT: TEX 0 @0
+; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
+; EG-NEXT: ALU 5, @45, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T1.W, T1.X, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, T0.X, PV.W,
+; EG-NEXT: BFE_INT T1.W, PV.W, 0.0, 1,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
+; EG-NEXT: LDS_SHORT_WRITE * T0.W, T1.W,
+; EG-NEXT: RETURN
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%a.gep = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
%b.gep = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
@@ -609,6 +2331,60 @@ define amdgpu_kernel void @v_sext_in_reg_i1_i16_nonload(ptr addrspace(3) %out, p
; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(ptr addrspace(1) %out, i16 %in) #0 {
+; SI-LABEL: s_sext_in_reg_i2_i16_arg:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_i32 s4, s2, 0x20000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_sext_in_reg_i2_i16_arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_lshl_b32 s0, s2, 14
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 14
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: s_sext_in_reg_i2_i16_arg:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: LSHL T0.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 30(4.203895e-44), 3(4.203895e-45)
+; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
+; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i16 %in, 14
%sext = ashr i16 %shl, 14
store i16 %sext, ptr addrspace(1) %out
@@ -626,6 +2402,58 @@ define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(ptr addrspace(1) %out, i16 %
; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(ptr addrspace(1) %out, i16 %in) #0 {
+; SI-LABEL: s_sext_in_reg_i8_i16_arg:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_sext_i32_i8 s4, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_sext_in_reg_i8_i16_arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_lshl_b32 s0, s2, 8
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 8
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: s_sext_in_reg_i8_i16_arg:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, PS, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i16 %in, 8
%sext = ashr i16 %shl, 8
store i16 %sext, ptr addrspace(1) %out
@@ -643,6 +2471,60 @@ define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(ptr addrspace(1) %out, i16 %
; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
define amdgpu_kernel void @s_sext_in_reg_i15_i16_arg(ptr addrspace(1) %out, i16 %in) #0 {
+; SI-LABEL: s_sext_in_reg_i15_i16_arg:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_bfe_i32 s4, s2, 0xf0000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_sext_in_reg_i15_i16_arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_lshl_b32 s0, s2, 1
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: s_sext_in_reg_i15_i16_arg:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @8, KC0[], KC1[]
+; EG-NEXT: TEX 0 @6
+; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
+; EG-NEXT: ALU clause starting at 8:
+; EG-NEXT: MOV * T0.X, 0.0,
+; EG-NEXT: ALU clause starting at 9:
+; EG-NEXT: LSHL T0.W, T0.X, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 17(2.382207e-44), 3(4.203895e-45)
+; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
+; EG-NEXT: 17(2.382207e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T0.X, PV.W, PS,
+; EG-NEXT: LSHL * T0.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T0.Y, 0.0,
+; EG-NEXT: MOV * T0.Z, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i16 %in, 1
%sext = ashr i16 %shl, 1
store i16 %sext, ptr addrspace(1) %out
@@ -654,6 +2536,68 @@ define amdgpu_kernel void @s_sext_in_reg_i15_i16_arg(ptr addrspace(1) %out, i16
; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 15, [[ADD]]
; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 15, [[SHL]]
define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i1_to_v2i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s4, s2, 16
+; SI-NEXT: s_lshr_b32 s5, s3, 16
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_add_i32 s4, s4, s5
+; SI-NEXT: s_bfe_i32 s2, s2, 0x10000
+; SI-NEXT: s_bfe_i32 s3, s4, 0x10000
+; SI-NEXT: s_and_b32 s2, s2, 0xffff
+; SI-NEXT: s_lshl_b32 s3, s3, 16
+; SI-NEXT: s_or_b32 s2, s2, s3
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i1_to_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_pk_add_u16 v0, s2, v0
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i1_to_v2i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @14, KC0[], KC1[]
+; EG-NEXT: TEX 3 @6
+; EG-NEXT: ALU 9, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
+; EG-NEXT: VTX_READ_16 T6.X, T4.X, 46, #3
+; EG-NEXT: VTX_READ_16 T7.X, T4.X, 40, #3
+; EG-NEXT: VTX_READ_16 T4.X, T4.X, 44, #3
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: MOV * T4.X, 0.0,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: ADD_INT * T0.W, T5.X, T6.X,
+; EG-NEXT: ADD_INT * T1.W, T7.X, T4.X,
+; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, 1,
+; EG-NEXT: BFE_INT * T0.W, T0.W, 0.0, 1,
+; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT T4.X, PV.W, PS,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <2 x i16> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i16> %c, <i16 15, i16 15>
%ashr = ashr <2 x i16> %shl, <i16 15, i16 15>
@@ -669,6 +2613,94 @@ define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(ptr addrspace(1) %out, <2 x
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(ptr addrspace(1) %out, <3 x i16> %a, <3 x i16> %b) #0 {
+; SI-LABEL: sext_in_reg_v3i1_to_v3i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s8, s4, 16
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: s_add_i32 s5, s5, s7
+; SI-NEXT: s_add_i32 s4, s4, s6
+; SI-NEXT: s_add_i32 s8, s8, s9
+; SI-NEXT: s_bfe_i32 s4, s4, 0x10000
+; SI-NEXT: s_bfe_i32 s5, s5, 0x10000
+; SI-NEXT: s_bfe_i32 s6, s8, 0x10000
+; SI-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: s_lshl_b32 s5, s6, 16
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v3i1_to_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: v_pk_add_u16 v0, s5, v0
+; GFX9-NEXT: v_pk_add_u16 v1, s4, v1
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v1, 15, v1 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v1, 15, v1 op_sel_hi:[0,1]
+; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v3i1_to_v3i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @18, KC0[], KC1[]
+; EG-NEXT: TEX 5 @6
+; EG-NEXT: ALU 25, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.X, T8.X, 0
+; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X
+; EG-NEXT: CF_END
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
+; EG-NEXT: VTX_READ_16 T7.X, T5.X, 52, #3
+; EG-NEXT: VTX_READ_16 T8.X, T5.X, 46, #3
+; EG-NEXT: VTX_READ_16 T9.X, T5.X, 54, #3
+; EG-NEXT: VTX_READ_16 T10.X, T5.X, 48, #3
+; EG-NEXT: VTX_READ_16 T5.X, T5.X, 56, #3
+; EG-NEXT: ALU clause starting at 18:
+; EG-NEXT: MOV * T5.X, 0.0,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T10.X, T5.X,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T1.W, PS, 0.0, 1,
+; EG-NEXT: AND_INT * T2.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T2.W, PS, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T5.X, PV.W, PS,
+; EG-NEXT: LSHL * T5.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T5.Y, 0.0,
+; EG-NEXT: MOV T5.Z, 0.0,
+; EG-NEXT: ADD_INT * T1.W, T8.X, T9.X,
+; EG-NEXT: ADD_INT * T2.W, T6.X, T7.X,
+; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, 1,
+; EG-NEXT: BFE_INT * T1.W, T1.W, 0.0, 1,
+; EG-NEXT: LSHR T6.X, T0.W, literal.x,
+; EG-NEXT: AND_INT T0.W, PV.W, literal.y,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.z,
+; EG-NEXT: 2(2.802597e-45), -65536(nan)
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: OR_INT T7.X, PV.W, PS,
+; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <3 x i16> %a, %b ; add to prevent folding into extload
%shl = shl <3 x i16> %c, <i16 15, i16 15, i16 15>
%ashr = ashr <3 x i16> %shl, <i16 15, i16 15, i16 15>
@@ -681,6 +2713,72 @@ define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(ptr addrspace(1) %out, <3 x
; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 14, [[ADD]]
; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 14, [[SHL]]
define amdgpu_kernel void @sext_in_reg_v2i2_to_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i2_to_v2i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s4, s2, 16
+; SI-NEXT: s_lshr_b32 s5, s3, 16
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_add_i32 s4, s4, s5
+; SI-NEXT: s_bfe_i32 s2, s2, 0x20000
+; SI-NEXT: s_bfe_i32 s3, s4, 0x20000
+; SI-NEXT: s_and_b32 s2, s2, 0xffff
+; SI-NEXT: s_lshl_b32 s3, s3, 16
+; SI-NEXT: s_or_b32 s2, s2, s3
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i2_to_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_pk_add_u16 v0, s2, v0
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 14, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 14, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i2_to_v2i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @14, KC0[], KC1[]
+; EG-NEXT: TEX 3 @6
+; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T5.X, T4.X, 40, #3
+; EG-NEXT: VTX_READ_16 T6.X, T4.X, 44, #3
+; EG-NEXT: VTX_READ_16 T7.X, T4.X, 42, #3
+; EG-NEXT: VTX_READ_16 T4.X, T4.X, 46, #3
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: MOV * T4.X, 0.0,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: ADD_INT * T0.W, T5.X, T6.X,
+; EG-NEXT: ADD_INT * T1.W, T7.X, T4.X,
+; EG-NEXT: LSHL T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T0.W, T0.W, literal.x,
+; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
+; EG-NEXT: ASHR T0.W, PS, literal.x,
+; EG-NEXT: ASHR * T1.W, PV.W, literal.x,
+; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, PS, literal.x,
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
+; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
+; EG-NEXT: OR_INT T4.X, PV.W, PS,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <2 x i16> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i16> %c, <i16 14, i16 14>
%ashr = ashr <2 x i16> %shl, <i16 14, i16 14>
@@ -693,6 +2791,69 @@ define amdgpu_kernel void @sext_in_reg_v2i2_to_v2i16(ptr addrspace(1) %out, <2 x
; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 8, [[ADD]]
; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 8, [[SHL]]
define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #0 {
+; SI-LABEL: sext_in_reg_v2i8_to_v2i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s4, s2, 16
+; SI-NEXT: s_lshr_b32 s5, s3, 16
+; SI-NEXT: s_add_i32 s2, s2, s3
+; SI-NEXT: s_add_i32 s4, s4, s5
+; SI-NEXT: s_sext_i32_i8 s2, s2
+; SI-NEXT: s_sext_i32_i8 s3, s4
+; SI-NEXT: s_and_b32 s2, s2, 0xffff
+; SI-NEXT: s_lshl_b32 s3, s3, 16
+; SI-NEXT: s_or_b32 s2, s2, s3
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_mov_b32 s5, s1
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v2i8_to_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_pk_add_u16 v0, s2, v0
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: s_mov_b32 s4, s0
+; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v2i8_to_v2i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @14, KC0[], KC1[]
+; EG-NEXT: TEX 3 @6
+; EG-NEXT: ALU 10, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
+; EG-NEXT: VTX_READ_16 T6.X, T4.X, 46, #3
+; EG-NEXT: VTX_READ_16 T7.X, T4.X, 40, #3
+; EG-NEXT: VTX_READ_16 T4.X, T4.X, 44, #3
+; EG-NEXT: ALU clause starting at 14:
+; EG-NEXT: MOV * T4.X, 0.0,
+; EG-NEXT: ALU clause starting at 15:
+; EG-NEXT: ADD_INT * T0.W, T5.X, T6.X,
+; EG-NEXT: ADD_INT * T1.W, T7.X, T4.X,
+; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T0.W, T0.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.y,
+; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
+; EG-NEXT: OR_INT T4.X, PV.W, PS,
+; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <2 x i16> %a, %b ; add to prevent folding into extload
%shl = shl <2 x i16> %c, <i16 8, i16 8>
%ashr = ashr <2 x i16> %shl, <i16 8, i16 8>
@@ -708,6 +2869,95 @@ define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(ptr addrspace(1) %out, <2 x
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(ptr addrspace(1) %out, <3 x i16> %a, <3 x i16> %b) #0 {
+; SI-LABEL: sext_in_reg_v3i8_to_v3i16:
+; SI: ; %bb.0:
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_lshr_b32 s8, s4, 16
+; SI-NEXT: s_lshr_b32 s9, s6, 16
+; SI-NEXT: s_add_i32 s5, s5, s7
+; SI-NEXT: s_add_i32 s4, s4, s6
+; SI-NEXT: s_add_i32 s8, s8, s9
+; SI-NEXT: s_sext_i32_i8 s4, s4
+; SI-NEXT: s_sext_i32_i8 s5, s5
+; SI-NEXT: s_sext_i32_i8 s6, s8
+; SI-NEXT: s_and_b32 s4, s4, 0xffff
+; SI-NEXT: v_mov_b32_e32 v0, s5
+; SI-NEXT: s_lshl_b32 s5, s6, 16
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX9-LABEL: sext_in_reg_v3i8_to_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s7
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-NEXT: v_pk_add_u16 v0, s5, v0
+; GFX9-NEXT: v_pk_add_u16 v1, s4, v1
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
+; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[8:11], 0
+; GFX9-NEXT: s_endpgm
+;
+; EG-LABEL: sext_in_reg_v3i8_to_v3i16:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 0, @18, KC0[], KC1[]
+; EG-NEXT: TEX 5 @6
+; EG-NEXT: ALU 26, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.X, T8.X, 0
+; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X
+; EG-NEXT: CF_END
+; EG-NEXT: Fetch clause starting at 6:
+; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
+; EG-NEXT: VTX_READ_16 T7.X, T5.X, 52, #3
+; EG-NEXT: VTX_READ_16 T8.X, T5.X, 46, #3
+; EG-NEXT: VTX_READ_16 T9.X, T5.X, 54, #3
+; EG-NEXT: VTX_READ_16 T10.X, T5.X, 48, #3
+; EG-NEXT: VTX_READ_16 T5.X, T5.X, 56, #3
+; EG-NEXT: ALU clause starting at 18:
+; EG-NEXT: MOV * T5.X, 0.0,
+; EG-NEXT: ALU clause starting at 19:
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT * T1.W, T10.X, T5.X,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: BFE_INT T1.W, PS, 0.0, literal.x,
+; EG-NEXT: AND_INT * T2.W, PV.W, literal.y,
+; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
+; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T2.W, PS, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
+; EG-NEXT: LSHL T5.X, PV.W, PS,
+; EG-NEXT: LSHL * T5.W, literal.x, PS,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: MOV T5.Y, 0.0,
+; EG-NEXT: MOV T5.Z, 0.0,
+; EG-NEXT: ADD_INT * T1.W, T8.X, T9.X,
+; EG-NEXT: ADD_INT * T2.W, T6.X, T7.X,
+; EG-NEXT: BFE_INT T0.Z, PV.W, 0.0, literal.x,
+; EG-NEXT: BFE_INT * T1.W, T1.W, 0.0, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T6.X, T0.W, literal.x,
+; EG-NEXT: LSHL T0.W, PV.W, literal.y,
+; EG-NEXT: AND_INT * T1.W, PV.Z, literal.z,
+; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: OR_INT T7.X, PV.W, PS,
+; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%c = add <3 x i16> %a, %b ; add to prevent folding into extload
%shl = shl <3 x i16> %c, <i16 8, i16 8, i16 8>
%ashr = ashr <3 x i16> %shl, <i16 8, i16 8, i16 8>
@@ -719,3 +2969,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FUNC: {{.*}}
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
index 1164d3bd770887..d55e201394a318 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN %s
@@ -9,6 +10,24 @@
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @lshr_i64_35(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: lshr_i64_35:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 3, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = lshr i64 %val, 35
store i64 %shl, ptr addrspace(1) %out
@@ -21,6 +40,24 @@ define amdgpu_kernel void @lshr_i64_35(ptr addrspace(1) %out, ptr addrspace(1) %
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @lshr_i64_63(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: lshr_i64_63:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = lshr i64 %val, 63
store i64 %shl, ptr addrspace(1) %out
@@ -33,6 +70,24 @@ define amdgpu_kernel void @lshr_i64_63(ptr addrspace(1) %out, ptr addrspace(1) %
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @lshr_i64_33(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: lshr_i64_33:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = lshr i64 %val, 33
store i64 %shl, ptr addrspace(1) %out
@@ -44,6 +99,23 @@ define amdgpu_kernel void @lshr_i64_33(ptr addrspace(1) %out, ptr addrspace(1) %
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @lshr_i64_32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: lshr_i64_32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = lshr i64 %val, 32
store i64 %shl, ptr addrspace(1) %out
@@ -59,6 +131,24 @@ define amdgpu_kernel void @lshr_i64_32(ptr addrspace(1) %out, ptr addrspace(1) %
; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[LO]], 8, 23
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
define amdgpu_kernel void @lshr_and_i64_35(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: lshr_and_i64_35:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_bfe_u32 v0, v0, 8, 23
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%and = and i64 %val, 9223372036854775807 ; 0x7fffffffffffffff
%shl = lshr i64 %and, 40
@@ -74,6 +164,24 @@ define amdgpu_kernel void @lshr_and_i64_35(ptr addrspace(1) %out, ptr addrspace(
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @shl_i64_const_35(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: shl_i64_const_35:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 35
store i64 %shl, ptr addrspace(1) %out
@@ -85,6 +193,23 @@ define amdgpu_kernel void @shl_i64_const_35(ptr addrspace(1) %out, ptr addrspace
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @shl_i64_const_32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: shl_i64_const_32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v1, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 32
store i64 %shl, ptr addrspace(1) %out
@@ -97,6 +222,24 @@ define amdgpu_kernel void @shl_i64_const_32(ptr addrspace(1) %out, ptr addrspace
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @shl_i64_const_63(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: shl_i64_const_63:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 31, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 63
store i64 %shl, ptr addrspace(1) %out
@@ -107,6 +250,23 @@ define amdgpu_kernel void @shl_i64_const_63(ptr addrspace(1) %out, ptr addrspace
; GCN-LABEL: {{^}}ashr_i64_const_32:
define amdgpu_kernel void @ashr_i64_const_32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: ashr_i64_const_32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = ashr i64 %val, 32
store i64 %shl, ptr addrspace(1) %out
@@ -115,6 +275,24 @@ define amdgpu_kernel void @ashr_i64_const_32(ptr addrspace(1) %out, ptr addrspac
; GCN-LABEL: {{^}}ashr_i64_const_63:
define amdgpu_kernel void @ashr_i64_const_63(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: ashr_i64_const_63:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0
+; GCN-NEXT: v_mov_b32_e32 v1, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = ashr i64 %val, 63
store i64 %shl, ptr addrspace(1) %out
@@ -126,6 +304,23 @@ define amdgpu_kernel void @ashr_i64_const_63(ptr addrspace(1) %out, ptr addrspac
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 31, [[VAL]]
; GCN: buffer_store_dword [[SHL]]
define amdgpu_kernel void @trunc_shl_31_i32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_31_i32_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 31, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 31
%trunc = trunc i64 %shl to i32
@@ -138,6 +333,23 @@ define amdgpu_kernel void @trunc_shl_31_i32_i64(ptr addrspace(1) %out, ptr addrs
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
; GCN: buffer_store_short [[SHL]]
define amdgpu_kernel void @trunc_shl_15_i16_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_15_i16_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 15, v0
+; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 15
%trunc = trunc i64 %shl to i16
@@ -150,6 +362,23 @@ define amdgpu_kernel void @trunc_shl_15_i16_i64(ptr addrspace(1) %out, ptr addrs
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
; GCN: buffer_store_short [[SHL]]
define amdgpu_kernel void @trunc_shl_15_i16_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_15_i16_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 15, v0
+; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in
%shl = shl i32 %val, 15
%trunc = trunc i32 %shl to i16
@@ -162,6 +391,23 @@ define amdgpu_kernel void @trunc_shl_15_i16_i32(ptr addrspace(1) %out, ptr addrs
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 7, [[VAL]]
; GCN: buffer_store_byte [[SHL]]
define amdgpu_kernel void @trunc_shl_7_i8_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_7_i8_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 7
%trunc = trunc i64 %shl to i8
@@ -175,6 +421,24 @@ define amdgpu_kernel void @trunc_shl_7_i8_i64(ptr addrspace(1) %out, ptr addrspa
; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 2, [[SHL]]
; GCN: buffer_store_byte [[AND]]
define amdgpu_kernel void @trunc_shl_1_i2_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_1_i2_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN-NEXT: v_and_b32_e32 v0, 2, v0
+; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 1
%trunc = trunc i64 %shl to i2
@@ -187,6 +451,23 @@ define amdgpu_kernel void @trunc_shl_1_i2_i64(ptr addrspace(1) %out, ptr addrspa
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]]
; GCN: buffer_store_dword [[SHL]]
define amdgpu_kernel void @trunc_shl_1_i32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_1_i32_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 1
%trunc = trunc i64 %shl to i32
@@ -199,6 +480,23 @@ define amdgpu_kernel void @trunc_shl_1_i32_i64(ptr addrspace(1) %out, ptr addrsp
; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[VAL]]
; GCN: buffer_store_dword [[SHL]]
define amdgpu_kernel void @trunc_shl_16_i32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_16_i32_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 16
%trunc = trunc i64 %shl to i32
@@ -210,6 +508,15 @@ define amdgpu_kernel void @trunc_shl_16_i32_i64(ptr addrspace(1) %out, ptr addrs
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[ZERO]]
define amdgpu_kernel void @trunc_shl_33_i32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_33_i32_i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 33
%trunc = trunc i64 %shl to i32
@@ -223,6 +530,24 @@ define amdgpu_kernel void @trunc_shl_33_i32_i64(ptr addrspace(1) %out, ptr addrs
; GCN-DAG: v_lshlrev_b32_e32 v[[RESLO:[0-9]+]], 16, v[[LO]]
; GCN: buffer_store_dwordx2 v[[[RESLO]]:[[RESHI]]]
define amdgpu_kernel void @trunc_shl_16_v2i32_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_16_v2i32_v2i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: s_endpgm
%val = load <2 x i64>, ptr addrspace(1) %in
%shl = shl <2 x i64> %val, <i64 16, i64 16>
%trunc = trunc <2 x i64> %shl to <2 x i32>
@@ -236,6 +561,26 @@ define amdgpu_kernel void @trunc_shl_16_v2i32_v2i64(ptr addrspace(1) %out, ptr a
; GCN: buffer_store_dword v[[RESLO]]
; GCN: buffer_store_dwordx2 v[[[RESLO]]:[[RESHI]]]
define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-LABEL: trunc_shl_31_i32_i64_multi_use:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %in
%shl = shl i64 %val, 31
%trunc = trunc i64 %shl to i32
@@ -249,6 +594,22 @@ define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(ptr addrspace(1) %out,
; GCN-NOT: v_lshl_b64
; GCN-NOT: v_lshlrev_b64
define amdgpu_kernel void @trunc_shl_and31(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
+; GCN-LABEL: trunc_shl_and31:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_load_dword s8, s[0:1], 0xd
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s0, s6
+; GCN-NEXT: s_mov_b32 s1, s7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, s8, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
bb:
%tmp = load i64, ptr addrspace(1) %arg, align 8
%tmp3 = and i32 %arg2, 31
@@ -265,6 +626,23 @@ bb:
; GCN-NOT: v_lshl_b64
; GCN-NOT: v_lshlrev_b64
define amdgpu_kernel void @trunc_shl_and30(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
+; GCN-LABEL: trunc_shl_and30:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_load_dword s8, s[0:1], 0xd
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_and_b32 s4, s8, 30
+; GCN-NEXT: s_mov_b32 s0, s6
+; GCN-NEXT: s_mov_b32 s1, s7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
bb:
%tmp = load i64, ptr addrspace(1) %arg, align 8
%tmp3 = and i32 %arg2, 30
@@ -279,6 +657,22 @@ bb:
; Negative test, wrong constant
; GCN: v_lshl_b64
define amdgpu_kernel void @trunc_shl_wrong_and63(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
+; GCN-LABEL: trunc_shl_wrong_and63:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_load_dword s8, s[0:1], 0xd
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s0, s6
+; GCN-NEXT: s_mov_b32 s1, s7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], s8
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
bb:
%tmp = load i64, ptr addrspace(1) %arg, align 8
%tmp3 = and i32 %arg2, 63
@@ -293,6 +687,22 @@ bb:
; Negative test, shift can be full 64 bit
; GCN: v_lshl_b64
define amdgpu_kernel void @trunc_shl_no_and(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
+; GCN-LABEL: trunc_shl_no_and:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT: s_load_dword s8, s[0:1], 0xd
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s0, s6
+; GCN-NEXT: s_mov_b32 s1, s7
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], s8
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
bb:
%tmp = load i64, ptr addrspace(1) %arg, align 8
%tmp4 = zext i32 %arg2 to i64
@@ -308,6 +718,23 @@ bb:
; GCN-DAG: v_lshl_b64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 5
; GCN-DAG: v_lshl_b64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 6
define amdgpu_kernel void @trunc_shl_vec_vec(ptr addrspace(1) %arg) {
+; GCN-LABEL: trunc_shl_vec_vec:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], 6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], 4
+; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], 3
+; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 5
+; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NEXT: s_endpgm
bb:
%v = load <4 x i64>, ptr addrspace(1) %arg, align 32
%shl = shl <4 x i64> %v, <i64 3, i64 4, i64 5, i64 6>
diff --git a/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll b/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
index 0f60790ff396fc..7df9ff34f4feec 100644
--- a/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck %s
; Check transformation shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
@@ -10,6 +11,21 @@
; CHECK: v_add_u32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADD]]
; CHECK: load_dword v{{[0-9]+}}, v[[[ADDRLO]]:
define amdgpu_kernel void @add_const_offset(ptr addrspace(1) nocapture %arg) {
+; CHECK-LABEL: add_const_offset:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NEXT: v_add_u32_e32 v0, vcc, 0xc80, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT: flat_load_dword v2, v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%add = add i32 %id, 200
@@ -27,6 +43,21 @@ bb:
; CHECK: v_add_u32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[OR]]
; CHECK: load_dword v{{[0-9]+}}, v[[[ADDRLO]]:
define amdgpu_kernel void @or_const_offset(ptr addrspace(1) nocapture %arg) {
+; CHECK-LABEL: or_const_offset:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-NEXT: v_or_b32_e32 v0, 0x1000, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT: flat_load_dword v2, v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%add = or i32 %id, 256
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
index 9127cc3ffb34ee..2968a63b150ad2 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
@@ -314,6 +315,18 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
; GCN: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:32
; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64
define void @shl_add_ptr_combine_2use_lds(i32 %idx) #0 {
+; GCN-LABEL: shl_add_ptr_combine_2use_lds:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 9
+; GCN-NEXT: s_mov_b32 m0, -1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT: ds_write_b32 v1, v2 offset:32
+; GCN-NEXT: v_mov_b32_e32 v1, 10
+; GCN-NEXT: ds_write_b32 v0, v1 offset:64
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
%shl1 = shl i32 %idx.add, 4
@@ -364,6 +377,18 @@ define void @shl_add_ptr_combine_2use_both_max_lds_offset(i32 %idx) #0 {
; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:16
; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], 0 offen offset:32
define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 {
+; GCN-LABEL: shl_add_ptr_combine_2use_private:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 9
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:16
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v1, 10
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%idx = zext i16 %idx.arg to i32
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 2
@@ -414,6 +439,19 @@ define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.a
; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8
; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
define void @shl_or_ptr_combine_2use_lds(i32 %idx) #0 {
+; GCN-LABEL: shl_or_ptr_combine_2use_lds:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 9
+; GCN-NEXT: s_mov_b32 m0, -1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GCN-NEXT: ds_write_b32 v0, v2 offset:8
+; GCN-NEXT: v_mov_b32_e32 v0, 10
+; GCN-NEXT: ds_write_b32 v1, v0 offset:16
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%idx.shl = shl i32 %idx, 1
%idx.add = or i32 %idx.shl, 1
%shl0 = shl i32 %idx.add, 3
@@ -431,6 +469,19 @@ define void @shl_or_ptr_combine_2use_lds(i32 %idx) #0 {
; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}}{{$}}
; GCN-DAG: ds_write_b32 [[SCALE1]], v{{[0-9]+}}{{$}}
define void @shl_or_ptr_not_combine_2use_lds(i32 %idx) #0 {
+; GCN-LABEL: shl_or_ptr_not_combine_2use_lds:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, 1, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 9
+; GCN-NEXT: s_mov_b32 m0, -1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT: ds_write_b32 v1, v2
+; GCN-NEXT: v_mov_b32_e32 v1, 10
+; GCN-NEXT: ds_write_b32 v0, v1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%idx.add = or i32 %idx, 1
%shl0 = shl i32 %idx.add, 3
%shl1 = shl i32 %idx.add, 4
diff --git a/llvm/test/CodeGen/AMDGPU/store-private.ll b/llvm/test/CodeGen/AMDGPU/store-private.ll
index 1c4ac88c9ed398..8e2d464bad2ddf 100644
--- a/llvm/test/CodeGen/AMDGPU/store-private.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-private.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -16,6 +17,47 @@
; SI: buffer_store_byte
define amdgpu_kernel void @store_i1(ptr addrspace(5) %out) {
+; EG-LABEL: store_i1:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 15, @0, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T2.W, T0.X, PV.W,
+; EG-NEXT: LSHL * T1.W, 1, T1.W,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i1:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 15, @0, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: AND_INT T0.Z, T0.X, PV.W,
+; CM-NEXT: LSHL * T1.W, 1, T1.W,
+; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: RETURN
entry:
store i1 true, ptr addrspace(5) %out
ret void
@@ -47,6 +89,55 @@ entry:
; SI: buffer_store_byte
define amdgpu_kernel void @store_i8(ptr addrspace(5) %out, i8 %in) {
+; EG-LABEL: store_i8:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 4, @1, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: MOV * T1.X, 0.0,
+; EG-NEXT: TEX 0 @0
+; EG-NEXT: VTX_READ_8 T1.X, T1.X, 40, #3
+; EG-NEXT: ALU 11, @2, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T2.W, T0.X, PV.W,
+; EG-NEXT: LSHL * T1.W, T1.X, T1.W,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i8:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 4, @1, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T0.X, T(0 + AR.x).X+,
+; CM-NEXT: MOV * T1.X, 0.0,
+; CM-NEXT: TEX 0 @0
+; CM-NEXT: VTX_READ_8 T1.X, T1.X, 40, #3
+; CM-NEXT: ALU 11, @2, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: AND_INT T0.Z, T0.X, PV.W,
+; CM-NEXT: LSHL * T1.W, T1.X, T1.W, BS:VEC_120/SCL_212
+; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: RETURN
entry:
store i8 %in, ptr addrspace(5) %out
ret void
@@ -75,6 +166,58 @@ entry:
; SI: buffer_store_short
define amdgpu_kernel void @store_i16(ptr addrspace(5) %out, i16 %in) {
+; EG-LABEL: store_i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 4, @3, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: MOV * T1.X, 0.0,
+; EG-NEXT: TEX 0 @0
+; EG-NEXT: VTX_READ_16 T1.X, T1.X, 40, #3
+; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, T1.X, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T2.W, T0.X, PV.W,
+; EG-NEXT: LSHL * T1.W, T3.W, T1.W,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i16:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 4, @3, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T0.X, T(0 + AR.x).X+,
+; CM-NEXT: MOV * T1.X, 0.0,
+; CM-NEXT: TEX 0 @0
+; CM-NEXT: VTX_READ_16 T1.X, T1.X, 40, #3
+; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, T1.X, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.X, PV.W,
+; CM-NEXT: LSHL * T1.W, PV.Z, T1.W,
+; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: RETURN
entry:
store i16 %in, ptr addrspace(5) %out
ret void
@@ -105,6 +248,92 @@ entry:
; CM: MOVA_INT
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
define amdgpu_kernel void @store_i24(ptr addrspace(5) %out, i24 %in) {
+; EG-LABEL: store_i24:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 37, @5, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[2].Z, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T0.Z, T0.X, PV.W,
+; EG-NEXT: LSHL T1.W, T3.W, T1.W,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T2.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, literal.x, PV.W,
+; EG-NEXT: MOV * T2.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T2.W, KC0[2].Z, literal.x, PS,
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, T0.X, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, T0.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i24:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 38, @5, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[2].Z, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.X, PV.W,
+; CM-NEXT: LSHL T0.Z, PV.Z, T1.W,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL T0.Z, literal.x, PV.W,
+; CM-NEXT: MOV * T1.W, literal.y,
+; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; CM-NEXT: BFE_UINT T2.Z, KC0[2].Z, literal.x, PV.W,
+; CM-NEXT: NOT_INT * T1.W, PV.Z,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, T0.X, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
store i24 %in, ptr addrspace(5) %out
ret void
@@ -123,6 +352,25 @@ entry:
; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
; CM-NOT: MOVA_INT
define amdgpu_kernel void @store_i25(ptr addrspace(5) %out, i25 %in) {
+; EG-LABEL: store_i25:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 4, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: LSHR * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 33554431(9.403954e-38), 2(2.802597e-45)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PS,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i25:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 4, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT T0.Z, KC0[2].Z, literal.x,
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: 33554431(9.403954e-38), 2(2.802597e-45)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Z,
+; CM-NEXT: RETURN
entry:
store i25 %in, ptr addrspace(5) %out
ret void
@@ -144,6 +392,59 @@ entry:
; SI: buffer_store_short
define amdgpu_kernel void @store_v2i8(ptr addrspace(5) %out, <2 x i32> %in) {
+; EG-LABEL: store_v2i8:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 21, @7, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.Z, KC0[3].X, literal.x,
+; EG-NEXT: AND_INT T2.W, KC0[2].W, literal.y,
+; EG-NEXT: LSHL * T1.W, PV.W, literal.z,
+; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43)
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T3.W, literal.x, PS,
+; EG-NEXT: OR_INT * T2.W, PV.Z, PV.W,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, PS, literal.x,
+; EG-NEXT: NOT_INT * T3.W, PV.W,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: AND_INT T3.W, T0.Y, PS,
+; EG-NEXT: LSHL * T1.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v2i8:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 21, @7, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL T1.Y, KC0[3].X, literal.x,
+; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.y,
+; CM-NEXT: LSHL * T1.W, PV.W, literal.z,
+; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43)
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.Z,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T1.W, PV.Z, T1.W,
+; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <2 x i32> %in to <2 x i8>
store <2 x i8> %0, ptr addrspace(5) %out
@@ -175,6 +476,87 @@ entry:
; SI: buffer_store_byte
define amdgpu_kernel void @store_v2i8_unaligned(ptr addrspace(5) %out, <2 x i32> %in) {
+; EG-LABEL: store_v2i8_unaligned:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 34, @8, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[2].W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T0.Z, T0.Y, PV.W,
+; EG-NEXT: LSHL T1.W, T3.W, T1.W,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, 1,
+; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T2.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T2.W, KC0[3].X, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: AND_INT T1.W, T0.Y, PV.W,
+; EG-NEXT: LSHL * T0.W, T2.W, T0.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v2i8_unaligned:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 36, @8, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T0.Z, PV.Z, T1.W,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, 1,
+; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[3].X, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T2.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <2 x i32> %in to <2 x i8>
store <2 x i8> %0, ptr addrspace(5) %out, align 1
@@ -194,6 +576,31 @@ entry:
; SI: buffer_store_dword
define amdgpu_kernel void @store_v2i16(ptr addrspace(5) %out, <2 x i32> %in) {
+; EG-LABEL: store_v2i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHL T0.W, KC0[3].X, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
+; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
+; EG-NEXT: OR_INT T0.W, PV.W, PS,
+; EG-NEXT: LSHR * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PS,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v2i16:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHL T0.Z, KC0[3].X, literal.x,
+; CM-NEXT: AND_INT * T0.W, KC0[2].W, literal.y,
+; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
+; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W,
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Z,
+; CM-NEXT: RETURN
entry:
%0 = trunc <2 x i32> %in to <2 x i16>
store <2 x i16> %0, ptr addrspace(5) %out
@@ -226,6 +633,89 @@ entry:
; SI: buffer_store_short
; SI: buffer_store_short
define amdgpu_kernel void @store_v2i16_unaligned(ptr addrspace(5) %out, <2 x i32> %in) {
+; EG-LABEL: store_v2i16_unaligned:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 35, @10, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[2].W, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T0.Z, T0.Y, PV.W,
+; EG-NEXT: LSHL T1.W, T3.W, T1.W,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T2.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T2.W, KC0[3].X, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: AND_INT T1.W, T0.Y, PV.W,
+; EG-NEXT: LSHL * T0.W, T2.W, T0.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v2i16_unaligned:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T0.Z, PV.Z, T1.W,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[3].X, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T2.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <2 x i32> %in to <2 x i16>
store <2 x i16> %0, ptr addrspace(5) %out, align 2
@@ -243,6 +733,51 @@ entry:
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i8(ptr addrspace(5) %out, <4 x i32> %in) {
+; EG-LABEL: store_v4i8:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 17, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: AND_INT * T0.W, KC0[3].W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, KC0[3].Z, literal.x,
+; EG-NEXT: LSHL T0.W, PV.W, literal.y,
+; EG-NEXT: LSHL * T1.W, KC0[4].X, literal.z,
+; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.W, PS, PV.W,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.x,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.W, PV.W, PS,
+; EG-NEXT: AND_INT * T1.W, KC0[3].Y, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: OR_INT T0.W, PV.W, PS,
+; EG-NEXT: LSHR * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PS,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4i8:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 17, @11, KC0[CB0:0-32], KC1[]
+; CM-NEXT: AND_INT * T0.W, KC0[3].W, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, KC0[3].Z, literal.x,
+; CM-NEXT: LSHL T0.Z, PV.W, literal.y,
+; CM-NEXT: LSHL * T0.W, KC0[4].X, literal.z,
+; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44)
+; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.x,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W,
+; CM-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W,
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Z,
+; CM-NEXT: RETURN
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
store <4 x i8> %0, ptr addrspace(5) %out
@@ -302,6 +837,182 @@ entry:
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v4i8_unaligned(ptr addrspace(5) %out, <4 x i32> %in) {
+; EG-LABEL: store_v4i8_unaligned:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 81, @12, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR * T1.W, PV.W, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[4].X, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T0.Z, T0.Y, PV.W,
+; EG-NEXT: LSHL T0.W, T3.W, T0.W,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T2.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T2.W, KC0[3].W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: AND_INT T0.Z, T0.Y, PV.W,
+; EG-NEXT: LSHL T0.W, T2.W, T0.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, 1,
+; EG-NEXT: LSHR T4.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[3].Z, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: AND_INT T1.W, T0.Y, PV.W,
+; EG-NEXT: LSHL * T0.W, T3.W, T0.W,
+; EG-NEXT: LSHR T5.W, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T5.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: LSHL T0.W, T3.W, literal.x,
+; EG-NEXT: LSHL * T1.W, KC0[4].X, literal.y,
+; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; EG-NEXT: AND_INT T0.Z, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT T0.W, PS, PV.W,
+; EG-NEXT: LSHL * T1.W, T2.W, literal.y,
+; EG-NEXT: 3(4.203895e-45), 16(2.242078e-44)
+; EG-NEXT: OR_INT T1.Z, PV.W, PS,
+; EG-NEXT: AND_INT T0.W, KC0[3].Y, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; EG-NEXT: LSHL T2.W, literal.x, PS,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.W, PS, literal.x,
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, T0.Y, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T5.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4i8_unaligned:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 84, @12, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR * T1.W, PV.W, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[4].X, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T0.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[3].W, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T2.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, 1,
+; CM-NEXT: LSHR T3.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, KC0[3].Z, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T2.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: LSHR T4.Z, KC0[2].Y, literal.x,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T4.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: LSHL T1.Z, T1.Z, literal.x,
+; CM-NEXT: LSHL * T0.W, KC0[4].X, literal.y,
+; CM-NEXT: 8(1.121039e-44), 24(3.363116e-44)
+; CM-NEXT: AND_INT T1.Y, KC0[2].Y, literal.x,
+; CM-NEXT: OR_INT T1.Z, PV.W, PV.Z,
+; CM-NEXT: LSHL * T0.W, T0.Z, literal.y,
+; CM-NEXT: 3(4.203895e-45), 16(2.242078e-44)
+; CM-NEXT: OR_INT T2.Y, PV.Z, PV.W,
+; CM-NEXT: AND_INT T0.Z, KC0[3].Y, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.y,
+; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.Z,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T4.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
store <4 x i8> %0, ptr addrspace(5) %out, align 1
@@ -413,6 +1124,440 @@ entry:
; SI: buffer_store_byte
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v8i8_unaligned(ptr addrspace(5) %out, <8 x i32> %in) {
+; EG-LABEL: store_v8i8_unaligned:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 106, @13, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.Y, T3.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T1.W, KC0[5].X, literal.y,
+; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T2.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T1.W, KC0[6].X, literal.y,
+; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: AND_INT * T0.W, KC0[4].Z, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: -65281(nan), 8(1.121039e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: AND_INT * T0.W, KC0[5].Z, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: -65281(nan), 8(1.121039e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV T0.Y, T3.X,
+; EG-NEXT: AND_INT * T0.W, KC0[4].W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV T0.Y, T2.X,
+; EG-NEXT: AND_INT * T0.W, KC0[5].W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
+; EG-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T2.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T3.X,
+; EG-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; EG-NEXT: AND_INT T0.W, KC0[4].Y, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: -256(nan), 255(3.573311e-43)
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOV * T3.X, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV T0.Z, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, T1.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T3.W, literal.x, PV.W,
+; EG-NEXT: LSHR * T4.W, T0.Y, literal.y,
+; EG-NEXT: 255(3.573311e-43), 24(3.363116e-44)
+; EG-NEXT: NOT_INT * T3.W, PV.W,
+; EG-NEXT: AND_INT T0.Z, T0.Z, PV.W,
+; EG-NEXT: LSHL T1.W, T4.W, T1.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T4.W, PS, literal.x,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOV * T0.Z, T2.X,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV T1.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, T3.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: MOV * T3.W, literal.y,
+; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; EG-NEXT: BFE_UINT T5.W, T0.Y, literal.x, PS,
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T1.Y, PS,
+; EG-NEXT: LSHL T1.W, PV.W, T1.W,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, 1,
+; EG-NEXT: LSHR T5.W, PS, literal.x,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T5.W,
+; EG-NEXT: MOV T1.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, T2.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T2.W, literal.x, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T4.W, T0.Y, literal.x, T3.W,
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, T1.Y, PS,
+; EG-NEXT: LSHL * T1.W, PV.W, T1.W,
+; EG-NEXT: ALU 102, @14, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR T4.W, KC0[2].Y, literal.x,
+; EG-NEXT: OR_INT * T1.W, T2.W, T1.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T5.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T1.Z, T0.Y, PV.W,
+; EG-NEXT: LSHL T0.W, T0.W, T1.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T1.W, literal.x, PV.W,
+; EG-NEXT: LSHR * T4.W, T0.Z, literal.y,
+; EG-NEXT: 255(3.573311e-43), 24(3.363116e-44)
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: AND_INT T1.Z, T0.Y, PV.W,
+; EG-NEXT: LSHL T0.W, T4.W, T0.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T4.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T2.W, T0.Z, literal.x, T3.W,
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, PS,
+; EG-NEXT: LSHL T0.W, PV.W, T0.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: BFE_UINT T3.W, T0.Z, literal.x, T3.W,
+; EG-NEXT: NOT_INT * T1.W, PV.W,
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.Z, T0.Y, PS,
+; EG-NEXT: LSHL T0.W, PV.W, T0.W,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T3.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T2.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T0.W, T1.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: AND_INT T0.Z, T0.Z, literal.x,
+; EG-NEXT: AND_INT T1.W, KC0[5].Y, literal.y,
+; EG-NEXT: LSHL * T0.W, PV.W, literal.z,
+; EG-NEXT: -256(nan), 255(3.573311e-43)
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PS,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T1.W, PS, literal.x,
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, T0.Y, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, T0.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T3.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v8i8_unaligned:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 107, @13, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MOV * T0.Y, T3.X,
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, KC0[5].X, literal.y,
+; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T3.X, PV.W,
+; CM-NEXT: MOV * T0.Y, T2.X,
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, KC0[6].X, literal.y,
+; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T2.X, PV.W,
+; CM-NEXT: MOV T0.Y, T3.X,
+; CM-NEXT: AND_INT * T0.W, KC0[4].Z, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
+; CM-NEXT: -65281(nan), 8(1.121039e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T3.X, PV.W,
+; CM-NEXT: MOV T0.Y, T2.X,
+; CM-NEXT: AND_INT * T0.W, KC0[5].Z, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
+; CM-NEXT: -65281(nan), 8(1.121039e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T2.X, PV.W,
+; CM-NEXT: MOV T0.Y, T3.X,
+; CM-NEXT: AND_INT * T0.W, KC0[4].W, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
+; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T3.X, PV.W,
+; CM-NEXT: MOV T0.Y, T2.X,
+; CM-NEXT: AND_INT * T0.W, KC0[5].W, literal.x,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
+; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T2.X, PV.W,
+; CM-NEXT: MOV * T0.Y, T3.X,
+; CM-NEXT: AND_INT T1.Y, PV.Y, literal.x,
+; CM-NEXT: AND_INT T0.Z, KC0[4].Y, literal.y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT: -256(nan), 255(3.573311e-43)
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOV * T3.X, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV T0.Z, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.Z, T0.Y, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Y, T0.Z, PV.W,
+; CM-NEXT: LSHL T0.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOV * T0.Z, T2.X,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV T1.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
+; CM-NEXT: MOV * T2.W, literal.y,
+; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
+; CM-NEXT: BFE_UINT T3.Z, T0.Y, literal.x, PV.W,
+; CM-NEXT: NOT_INT * T3.W, PV.Z,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Y, T1.Y, PV.W,
+; CM-NEXT: LSHL T1.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, 1,
+; CM-NEXT: LSHR T3.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T3.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T3.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV T1.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T3.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: BFE_UINT T1.Z, T0.Y, literal.x, T2.W,
+; CM-NEXT: NOT_INT * T3.W, PV.W,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T2.Z, T1.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: ALU 104, @14, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR T1.Z, KC0[2].Y, literal.x,
+; CM-NEXT: OR_INT * T0.W, T2.Z, T0.W,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T3.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T2.Z, T1.W, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T2.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 7(9.809089e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T3.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.Z, T0.Z, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T1.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 6(8.407791e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: BFE_UINT T1.Z, T0.Z, literal.x, T2.W,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T1.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 5(7.006492e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T3.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: BFE_UINT T1.Z, T0.Z, literal.x, T2.W,
+; CM-NEXT: NOT_INT * T1.W, PV.W,
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T1.Z, PV.Z, T0.W,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T3.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T0.W, T0.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Y, T0.Z, literal.x,
+; CM-NEXT: AND_INT T0.Z, KC0[5].Y, literal.y,
+; CM-NEXT: LSHL * T0.W, PV.W, literal.z,
+; CM-NEXT: -256(nan), 255(3.573311e-43)
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.Z,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <8 x i32> %in to <8 x i8>
store <8 x i8> %0, ptr addrspace(5) %out, align 1
@@ -446,6 +1591,109 @@ entry:
; SI: buffer_store_short
; SI-NOT: buffer_store_dword
define amdgpu_kernel void @store_v4i8_halfaligned(ptr addrspace(5) %out, <4 x i32> %in) {
+; EG-LABEL: store_v4i8_halfaligned:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 46, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: AND_INT * T2.W, KC0[3].Z, literal.y,
+; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T0.Z, literal.x, PV.W,
+; EG-NEXT: LSHL T2.W, T2.W, literal.y,
+; EG-NEXT: AND_INT * T3.W, KC0[3].Y, literal.z,
+; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: OR_INT T3.W, PV.W, PS,
+; EG-NEXT: NOT_INT * T4.W, PV.Z,
+; EG-NEXT: AND_INT T0.Z, T0.Y, PS,
+; EG-NEXT: LSHL T1.W, PV.W, T1.W,
+; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T4.W, PS, literal.x,
+; EG-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT T0.Z, T3.W, literal.x,
+; EG-NEXT: AND_INT T0.W, KC0[3].W, literal.y,
+; EG-NEXT: LSHL * T1.W, KC0[4].X, literal.z,
+; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT T1.Z, PS, T2.W,
+; EG-NEXT: LSHL T0.W, PV.W, literal.x,
+; EG-NEXT: LSHL * T1.W, PV.Z, literal.y,
+; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45)
+; EG-NEXT: LSHL T2.W, literal.x, PS,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.W, PS, literal.x,
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.W, T0.Y, PS,
+; EG-NEXT: LSHL * T0.W, PV.W, T1.W,
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T4.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4i8_halfaligned:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 46, @15, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[3].Z, literal.x,
+; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
+; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
+; CM-NEXT: LSHL T1.Y, literal.x, PV.W,
+; CM-NEXT: LSHL T0.Z, PV.Z, literal.y,
+; CM-NEXT: AND_INT * T2.W, KC0[3].Y, literal.z,
+; CM-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: OR_INT T1.Z, PV.Z, PV.W,
+; CM-NEXT: NOT_INT * T2.W, PV.Y,
+; CM-NEXT: AND_INT T0.Y, T0.Y, PV.W,
+; CM-NEXT: LSHL T1.Z, PV.Z, T1.W,
+; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T2.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T2.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV * T0.Y, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT T1.Y, T1.W, literal.x,
+; CM-NEXT: AND_INT T1.Z, KC0[3].W, literal.y,
+; CM-NEXT: LSHL * T0.W, KC0[4].X, literal.z,
+; CM-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; CM-NEXT: OR_INT T2.Y, PV.W, T0.Z,
+; CM-NEXT: LSHL T0.Z, PV.Z, literal.x,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.y,
+; CM-NEXT: 16(2.242078e-44), 3(4.203895e-45)
+; CM-NEXT: LSHL T1.Z, literal.x, PV.W,
+; CM-NEXT: OR_INT * T1.W, PV.Y, PV.Z,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: LSHR T0.Z, PV.W, literal.x,
+; CM-NEXT: NOT_INT * T1.W, PV.Z,
+; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.Y, PV.W,
+; CM-NEXT: LSHL * T0.W, PV.Z, T0.W,
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <4 x i32> %in to <4 x i8>
store <4 x i8> %0, ptr addrspace(5) %out, align 2
@@ -463,6 +1711,25 @@ entry:
; SI: buffer_store_dword
define amdgpu_kernel void @store_f32(ptr addrspace(5) %out, float %in) {
+; EG-LABEL: store_f32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @16, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[2].Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_f32:
+; CM: ; %bb.0:
+; CM-NEXT: ALU 4, @16, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR T0.Z, KC0[2].Y, literal.x,
+; CM-NEXT: MOV * T0.W, KC0[2].Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
store float %in, ptr addrspace(5) %out
ret void
}
@@ -483,6 +1750,83 @@ define amdgpu_kernel void @store_f32(ptr addrspace(5) %out, float %in) {
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i16(ptr addrspace(5) %out, <4 x i32> %in) {
+; EG-LABEL: store_v4i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 33, @17, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.Y, T3.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T1.W, KC0[4].X, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV * T3.X, PV.W,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT * T1.W, KC0[3].W, literal.y,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV T3.X, PV.W,
+; EG-NEXT: MOV * T0.Y, T2.X,
+; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
+; EG-NEXT: LSHL * T1.W, KC0[3].Z, literal.y,
+; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; EG-NEXT: OR_INT * T0.W, PV.W, PS,
+; EG-NEXT: MOV * T2.X, PV.W,
+; EG-NEXT: MOV * T0.Y, PV.X,
+; EG-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; EG-NEXT: AND_INT T0.W, KC0[3].Y, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: -65536(nan), 65535(9.183409e-41)
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T1.Z, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T1.W, PS, literal.x,
+; EG-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOV T2.X, PS,
+; EG-NEXT: MOV * T0.Y, T3.X,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4i16:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 33, @17, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MOV * T0.Y, T3.X,
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, KC0[4].X, literal.y,
+; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV * T3.X, PV.W,
+; CM-NEXT: MOV * T0.Y, PV.X,
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: AND_INT * T0.W, KC0[3].W, literal.y,
+; CM-NEXT: -65536(nan), 65535(9.183409e-41)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV T3.X, PV.W,
+; CM-NEXT: MOV * T0.Y, T2.X,
+; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
+; CM-NEXT: LSHL * T0.W, KC0[3].Z, literal.y,
+; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
+; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
+; CM-NEXT: MOV * T2.X, PV.W,
+; CM-NEXT: MOV * T0.Y, PV.X,
+; CM-NEXT: AND_INT T0.Y, PV.Y, literal.x,
+; CM-NEXT: AND_INT T0.Z, KC0[3].Y, literal.y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT: -65536(nan), 65535(9.183409e-41)
+; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.Y, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR T1.Z, PV.W, literal.x,
+; CM-NEXT: OR_INT * T0.W, PV.Y, PV.Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOV T2.X, PV.W,
+; CM-NEXT: MOV * T0.Y, T3.X,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Y,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc <4 x i32> %in to <4 x i16>
store <4 x i16> %0, ptr addrspace(5) %out
@@ -507,6 +1851,37 @@ entry:
; SI: buffer_store_dword
define amdgpu_kernel void @store_v2f32(ptr addrspace(5) %out, float %a, float %b) {
+; EG-LABEL: store_v2f32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 10, @18, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.Y, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T0.Z, KC0[2].Z,
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[2].W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.Y,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.Z,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v2f32:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 10, @18, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; CM-NEXT: MOV T0.Y, KC0[2].Z,
+; CM-NEXT: LSHR T0.Z, PV.W, literal.x,
+; CM-NEXT: MOV * T0.W, KC0[2].W,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; CM-NEXT: RETURN
entry:
%0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
%1 = insertelement <2 x float> %0, float %b, i32 1
@@ -536,6 +1911,49 @@ entry:
; SI: buffer_store_dword
define amdgpu_kernel void @store_v3i32(ptr addrspace(5) %out, <3 x i32> %a) nounwind {
+; EG-LABEL: store_v3i32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 16, @19, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR T0.Z, KC0[2].Y, literal.x,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, KC0[3].Y,
+; EG-NEXT: LSHR T0.Y, PS, literal.x,
+; EG-NEXT: MOV T1.Z, KC0[3].Z,
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[3].W,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.Y,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.Z,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.Z,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.X,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v3i32:
+; CM: ; %bb.0:
+; CM-NEXT: ALU 16, @19, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; CM-NEXT: MOV T0.Y, KC0[3].Y,
+; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.X, PV.W, literal.x,
+; CM-NEXT: MOV T1.Y, KC0[3].Z,
+; CM-NEXT: LSHR T0.Z, PV.Z, literal.x,
+; CM-NEXT: MOV * T0.W, KC0[3].W,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; CM-NEXT: RETURN
store <3 x i32> %a, ptr addrspace(5) %out, align 16
ret void
}
@@ -566,6 +1984,61 @@ define amdgpu_kernel void @store_v3i32(ptr addrspace(5) %out, <3 x i32> %a) noun
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i32(ptr addrspace(5) %out, <4 x i32> %in) {
+; EG-LABEL: store_v4i32:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 22, @20, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T0.Y, KC0[3].Y,
+; EG-NEXT: LSHR T0.Z, PV.W, literal.x,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45)
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: MOV T1.X, KC0[4].X,
+; EG-NEXT: LSHR T1.Y, PS, literal.x,
+; EG-NEXT: MOV T1.Z, KC0[3].W,
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[3].Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.Y,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.Z,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.Z,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.X,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4i32:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 22, @20, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR T0.Y, KC0[2].Y, literal.x,
+; CM-NEXT: MOV T0.Z, KC0[3].Y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: 2(2.802597e-45), 12(1.681558e-44)
+; CM-NEXT: LSHR T0.X, PV.W, literal.x,
+; CM-NEXT: MOV T1.Y, KC0[4].X,
+; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT: 2(2.802597e-45), 4(5.605194e-45)
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.X, PV.W, literal.x,
+; CM-NEXT: MOV T2.Y, KC0[3].W,
+; CM-NEXT: LSHR T1.Z, PV.Z, literal.x,
+; CM-NEXT: MOV * T0.W, KC0[3].Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.Y,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Z,
+; CM-NEXT: RETURN
entry:
store <4 x i32> %in, ptr addrspace(5) %out
ret void
@@ -597,6 +2070,61 @@ entry:
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4i32_unaligned(ptr addrspace(5) %out, <4 x i32> %in) {
+; EG-LABEL: store_v4i32_unaligned:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 22, @21, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; EG-NEXT: MOV T0.Y, KC0[3].Y,
+; EG-NEXT: LSHR T0.Z, PV.W, literal.x,
+; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
+; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45)
+; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; EG-NEXT: MOV T1.X, KC0[4].X,
+; EG-NEXT: LSHR T1.Y, PS, literal.x,
+; EG-NEXT: MOV T1.Z, KC0[3].W,
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: MOV * T1.W, KC0[3].Z,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.Y,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.Z,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.Z,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.X,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4i32_unaligned:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 22, @21, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR T0.Y, KC0[2].Y, literal.x,
+; CM-NEXT: MOV T0.Z, KC0[3].Y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: 2(2.802597e-45), 12(1.681558e-44)
+; CM-NEXT: LSHR T0.X, PV.W, literal.x,
+; CM-NEXT: MOV T1.Y, KC0[4].X,
+; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
+; CM-NEXT: 2(2.802597e-45), 4(5.605194e-45)
+; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
+; CM-NEXT: LSHR T1.X, PV.W, literal.x,
+; CM-NEXT: MOV T2.Y, KC0[3].W,
+; CM-NEXT: LSHR T1.Z, PV.Z, literal.x,
+; CM-NEXT: MOV * T0.W, KC0[3].Z,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T2.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.Y,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Z,
+; CM-NEXT: RETURN
entry:
store <4 x i32> %in, ptr addrspace(5) %out, align 4
ret void
@@ -629,6 +2157,85 @@ entry:
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @store_v4f32(ptr addrspace(5) %out, ptr addrspace(5) %in) {
+; EG-LABEL: store_v4f32:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 34, @22, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.Z, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
+; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
+; EG-NEXT: LSHR T0.X, PS, literal.x,
+; EG-NEXT: ADD_INT T0.Y, KC0[2].Z, literal.y,
+; EG-NEXT: ADD_INT T1.Z, KC0[2].Z, literal.z,
+; EG-NEXT: ADD_INT T1.W, KC0[2].Z, literal.w,
+; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.w,
+; EG-NEXT: 2(2.802597e-45), 12(1.681558e-44)
+; EG-NEXT: 8(1.121039e-44), 4(5.605194e-45)
+; EG-NEXT: LSHR T1.X, PS, literal.x,
+; EG-NEXT: LSHR T1.Y, PV.W, literal.x,
+; EG-NEXT: LSHR T1.Z, PV.Z, literal.x,
+; EG-NEXT: LSHR T1.W, PV.Y, literal.x,
+; EG-NEXT: LSHR * T2.W, KC0[2].Z, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PS,
+; EG-NEXT: MOV * T0.Y, T(0 + AR.x).X+,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.W,
+; EG-NEXT: MOV * T1.W, T(0 + AR.x).X+,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; EG-NEXT: MOV * T1.Z, T(0 + AR.x).X+,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.Y,
+; EG-NEXT: MOV * T1.Y, T(0 + AR.x).X+,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T1.X,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.Y,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.Z,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.Z,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_v4f32:
+; CM: ; %bb.0:
+; CM-NEXT: ALU 34, @22, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: 8(1.121039e-44), 12(1.681558e-44)
+; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR T0.Y, PV.W, literal.x,
+; CM-NEXT: LSHR T0.Z, PV.Z, literal.x,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
+; CM-NEXT: 2(2.802597e-45), 4(5.605194e-45)
+; CM-NEXT: LSHR T1.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T1.Y, KC0[2].Z, literal.y,
+; CM-NEXT: ADD_INT T1.Z, KC0[2].Z, literal.z,
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.w,
+; CM-NEXT: 2(2.802597e-45), 12(1.681558e-44)
+; CM-NEXT: 8(1.121039e-44), 4(5.605194e-45)
+; CM-NEXT: LSHR T2.X, PV.W, literal.x,
+; CM-NEXT: LSHR T2.Y, PV.Z, literal.x,
+; CM-NEXT: LSHR T1.Z, PV.Y, literal.x,
+; CM-NEXT: LSHR * T0.W, KC0[2].Z, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T0.W, T(0 + AR.x).X+,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.Z,
+; CM-NEXT: MOV * T1.Y, T(0 + AR.x).X+,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.Y,
+; CM-NEXT: MOV * T1.Z, T(0 + AR.x).X+,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T2.X,
+; CM-NEXT: MOV * T1.W, T(0 + AR.x).X+,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T1.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.Z,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.Y,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.Y,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.X,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.W,
+; CM-NEXT: RETURN
%1 = load <4 x float>, ptr addrspace(5) %in
store <4 x float> %1, ptr addrspace(5) %out
ret void
@@ -647,6 +2254,50 @@ define amdgpu_kernel void @store_v4f32(ptr addrspace(5) %out, ptr addrspace(5) %
; SI: buffer_store_byte
define amdgpu_kernel void @store_i64_i8(ptr addrspace(5) %out, i64 %in) {
+; EG-LABEL: store_i64_i8:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 16, @23, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[2].W, literal.x,
+; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T2.W, T0.X, PV.W,
+; EG-NEXT: LSHL * T1.W, T3.W, T1.W,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i64_i8:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 17, @23, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.X, PV.W,
+; CM-NEXT: LSHL * T1.W, PV.Z, T1.W,
+; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc i64 %in to i8
store i8 %0, ptr addrspace(5) %out
@@ -666,6 +2317,50 @@ entry:
; SI: buffer_store_short
define amdgpu_kernel void @store_i64_i16(ptr addrspace(5) %out, i64 %in) {
+; EG-LABEL: store_i64_i16:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 16, @24, KC0[CB0:0-32], KC1[]
+; EG-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; EG-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T3.W, KC0[2].W, literal.x,
+; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; EG-NEXT: NOT_INT * T2.W, PV.W,
+; EG-NEXT: AND_INT T2.W, T0.X, PV.W,
+; EG-NEXT: LSHL * T1.W, T3.W, T1.W,
+; EG-NEXT: OR_INT * T1.W, PV.W, PS,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: store_i64_i16:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 17, @24, KC0[CB0:0-32], KC1[]
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV T0.X, T(0 + AR.x).X+,
+; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T1.W, PV.W, literal.x,
+; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
+; CM-NEXT: LSHL * T2.W, literal.x, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
+; CM-NEXT: NOT_INT * T2.W, PV.W,
+; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
+; CM-NEXT: AND_INT T1.Z, T0.X, PV.W,
+; CM-NEXT: LSHL * T1.W, PV.Z, T1.W,
+; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T1.W,
+; CM-NEXT: RETURN
entry:
%0 = trunc i64 %in to i16
store i16 %0, ptr addrspace(5) %out
@@ -692,6 +2387,41 @@ entry:
; SI: buffer_store_dword
; SI: buffer_store_dword
define amdgpu_kernel void @vecload2(ptr addrspace(5) nocapture %out, ptr addrspace(4) nocapture %mem) #0 {
+; EG-LABEL: vecload2:
+; EG: ; %bb.0: ; %entry
+; EG-NEXT: ALU 0, @25, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MOV * T0.X, KC0[2].Z,
+; EG-NEXT: TEX 0 @0
+; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; EG-NEXT: ALU 8, @26, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; EG-NEXT: LSHR T0.W, PV.W, literal.x,
+; EG-NEXT: LSHR * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; EG-NEXT: MOVA_INT * AR.x (MASKED), PS,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.X,
+; EG-NEXT: MOVA_INT * AR.x (MASKED), T0.W,
+; EG-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; EG-NEXT: RETURN
+;
+; CM-LABEL: vecload2:
+; CM: ; %bb.0: ; %entry
+; CM-NEXT: ALU 0, @25, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MOV * T0.X, KC0[2].Z,
+; CM-NEXT: TEX 0 @0
+; CM-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
+; CM-NEXT: ALU 8, @26, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
+; CM-NEXT: LSHR T0.Z, PV.W, literal.x,
+; CM-NEXT: LSHR * T0.W, KC0[2].Y, literal.x,
+; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+; CM-NEXT: MOVA_INT * AR.x (MASKED), PV.W,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.X,
+; CM-NEXT: MOVA_INT * AR.x (MASKED), T0.Z,
+; CM-NEXT: MOV * T(0 + AR.x).X+, T0.Y,
+; CM-NEXT: RETURN
entry:
%0 = load i32, ptr addrspace(4) %mem, align 4
%arrayidx1.i = getelementptr inbounds i32, ptr addrspace(4) %mem, i64 1
@@ -743,3 +2473,6 @@ entry:
attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; FUNC: {{.*}}
+; SI: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/Windows/alloca.ll b/llvm/test/CodeGen/ARM/Windows/alloca.ll
index e014d287db6e90..d28836d1c70631 100644
--- a/llvm/test/CodeGen/ARM/Windows/alloca.ll
+++ b/llvm/test/CodeGen/ARM/Windows/alloca.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -O0 -mtriple thumbv7-windows-itanium -filetype asm -o - %s | FileCheck %s
; RUN: llc -O0 -mtriple thumbv7-windows-msvc -filetype asm -o - %s | FileCheck %s
; RUN: llc -O0 -mtriple thumbv7-windows-mingw32 -filetype asm -o - %s | FileCheck %s
@@ -25,3 +26,5 @@ entry:
; CHECK: bl __chkstk
; CHECK: sub.w sp, sp, r4
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/Windows/vla.ll b/llvm/test/CodeGen/ARM/Windows/vla.ll
index 459db0c290b5a0..3adca905850c2d 100644
--- a/llvm/test/CodeGen/ARM/Windows/vla.ll
+++ b/llvm/test/CodeGen/ARM/Windows/vla.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -o - %s \
; RUN: | FileCheck %s -check-prefix CHECK-SMALL-CODE
; RUN: llc -mtriple=thumbv7-windows-itanium -mcpu=cortex-a9 -code-model=large -o - %s \
@@ -26,3 +27,6 @@ entry:
; CHECK-LARGE-CODE: movt [[IP]], :upper16:__chkstk
; CHECK-LARGE-CODE: blx [[IP]]
; CHECK-LARGE-CODE: sub.w sp, sp, r4
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-LARGE-CODE: {{.*}}
+; CHECK-SMALL-CODE: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/and-cmpz.ll b/llvm/test/CodeGen/ARM/and-cmpz.ll
index 1f72307f12a682..e1c9fe52911b99 100644
--- a/llvm/test/CodeGen/ARM/and-cmpz.ll
+++ b/llvm/test/CodeGen/ARM/and-cmpz.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=thumbv7m-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=T2
; RUN: llc -mtriple=thumbv6m-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=T1
@@ -7,6 +8,24 @@
; T2-NEXT: it
; T1-NEXT: bmi
define i32 @single_bit(i32 %p) {
+; T2-LABEL: single_bit:
+; T2: @ %bb.0: @ %common.ret
+; T2-NEXT: lsls r0, r0, #23
+; T2-NEXT: mov.w r0, #2
+; T2-NEXT: it pl
+; T2-NEXT: movpl r0, #1
+; T2-NEXT: bx lr
+;
+; T1-LABEL: single_bit:
+; T1: @ %bb.0:
+; T1-NEXT: lsls r0, r0, #23
+; T1-NEXT: bmi .LBB0_2
+; T1-NEXT: @ %bb.1: @ %true
+; T1-NEXT: movs r0, #1
+; T1-NEXT: bx lr
+; T1-NEXT: .LBB0_2: @ %false
+; T1-NEXT: movs r0, #2
+; T1-NEXT: bx lr
%a = and i32 %p, 256
%b = icmp eq i32 %a, 0
br i1 %b, label %true, label %false
@@ -24,6 +43,26 @@ false:
; T2-NEXT: it
; T1-NEXT: bmi
define i32 @single_bit_multi_use(i32 %p, ptr %z) {
+; T2-LABEL: single_bit_multi_use:
+; T2: @ %bb.0: @ %common.ret
+; T2-NEXT: str r0, [r1]
+; T2-NEXT: lsls r0, r0, #23
+; T2-NEXT: mov.w r0, #2
+; T2-NEXT: it pl
+; T2-NEXT: movpl r0, #1
+; T2-NEXT: bx lr
+;
+; T1-LABEL: single_bit_multi_use:
+; T1: @ %bb.0:
+; T1-NEXT: str r0, [r1]
+; T1-NEXT: lsls r0, r0, #23
+; T1-NEXT: bmi .LBB1_2
+; T1-NEXT: @ %bb.1: @ %true
+; T1-NEXT: movs r0, #1
+; T1-NEXT: bx lr
+; T1-NEXT: .LBB1_2: @ %false
+; T1-NEXT: movs r0, #2
+; T1-NEXT: bx lr
store i32 %p, ptr %z
%a = and i32 %p, 256
%b = icmp eq i32 %a, 0
@@ -42,6 +81,24 @@ false:
; T2-NEXT: it
; T1-NEXT: beq
define i32 @multi_bit_lsb_ubfx(i32 %p) {
+; T2-LABEL: multi_bit_lsb_ubfx:
+; T2: @ %bb.0: @ %common.ret
+; T2-NEXT: lsls r0, r0, #24
+; T2-NEXT: mov.w r0, #2
+; T2-NEXT: it eq
+; T2-NEXT: moveq r0, #1
+; T2-NEXT: bx lr
+;
+; T1-LABEL: multi_bit_lsb_ubfx:
+; T1: @ %bb.0:
+; T1-NEXT: lsls r0, r0, #24
+; T1-NEXT: beq .LBB2_2
+; T1-NEXT: @ %bb.1: @ %false
+; T1-NEXT: movs r0, #2
+; T1-NEXT: bx lr
+; T1-NEXT: .LBB2_2: @ %true
+; T1-NEXT: movs r0, #1
+; T1-NEXT: bx lr
%a = and i32 %p, 255
%b = icmp eq i32 %a, 0
br i1 %b, label %true, label %false
@@ -59,6 +116,24 @@ false:
; T2-NEXT: it
; T1-NEXT: beq
define i32 @multi_bit_msb(i32 %p) {
+; T2-LABEL: multi_bit_msb:
+; T2: @ %bb.0: @ %common.ret
+; T2-NEXT: lsrs r0, r0, #24
+; T2-NEXT: mov.w r0, #2
+; T2-NEXT: it eq
+; T2-NEXT: moveq r0, #1
+; T2-NEXT: bx lr
+;
+; T1-LABEL: multi_bit_msb:
+; T1: @ %bb.0:
+; T1-NEXT: lsrs r0, r0, #24
+; T1-NEXT: beq .LBB3_2
+; T1-NEXT: @ %bb.1: @ %false
+; T1-NEXT: movs r0, #2
+; T1-NEXT: bx lr
+; T1-NEXT: .LBB3_2: @ %true
+; T1-NEXT: movs r0, #1
+; T1-NEXT: bx lr
%a = and i32 %p, 4278190080 ; 0xff000000
%b = icmp eq i32 %a, 0
br i1 %b, label %true, label %false
@@ -77,6 +152,26 @@ false:
; T2-NEXT: it
; T1-NEXT: beq
define i32 @multi_bit_nosb(i32 %p) {
+; T2-LABEL: multi_bit_nosb:
+; T2: @ %bb.0: @ %common.ret
+; T2-NEXT: movs r1, #2
+; T2-NEXT: tst.w r0, #16711680
+; T2-NEXT: it eq
+; T2-NEXT: moveq r1, #1
+; T2-NEXT: mov r0, r1
+; T2-NEXT: bx lr
+;
+; T1-LABEL: multi_bit_nosb:
+; T1: @ %bb.0:
+; T1-NEXT: lsls r0, r0, #8
+; T1-NEXT: lsrs r0, r0, #24
+; T1-NEXT: beq .LBB4_2
+; T1-NEXT: @ %bb.1: @ %false
+; T1-NEXT: movs r0, #2
+; T1-NEXT: bx lr
+; T1-NEXT: .LBB4_2: @ %true
+; T1-NEXT: movs r0, #1
+; T1-NEXT: bx lr
%a = and i32 %p, 16711680 ; 0x00ff0000
%b = icmp eq i32 %a, 0
br i1 %b, label %true, label %false
@@ -96,6 +191,28 @@ false:
; T2-NEXT: movs r2, #0
; T2-NEXT: cmp.w r2, r0, lsr #9
define void @i16_cmpz(i16 %x, ptr %foo) {
+; T2-LABEL: i16_cmpz:
+; T2: @ %bb.0: @ %entry
+; T2-NEXT: uxth r0, r0
+; T2-NEXT: movs r2, #0
+; T2-NEXT: cmp.w r2, r0, lsr #9
+; T2-NEXT: it ne
+; T2-NEXT: bxne lr
+; T2-NEXT: .LBB5_1: @ %if.then
+; T2-NEXT: movs r0, #0
+; T2-NEXT: bx r1
+;
+; T1-LABEL: i16_cmpz:
+; T1: @ %bb.0: @ %entry
+; T1-NEXT: push {r7, lr}
+; T1-NEXT: uxth r0, r0
+; T1-NEXT: lsrs r0, r0, #9
+; T1-NEXT: bne .LBB5_2
+; T1-NEXT: @ %bb.1: @ %if.then
+; T1-NEXT: movs r0, #0
+; T1-NEXT: blx r1
+; T1-NEXT: .LBB5_2: @ %if.end
+; T1-NEXT: pop {r7, pc}
entry:
%cmp = icmp ult i16 %x, 512
br i1 %cmp, label %if.then, label %if.end
@@ -107,3 +224,5 @@ if.then: ; preds = %entry
if.end: ; preds = %if.then, %entry
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/bfx.ll b/llvm/test/CodeGen/ARM/bfx.ll
index a585fc8be9ede1..fdde6be286b2bd 100644
--- a/llvm/test/CodeGen/ARM/bfx.ll
+++ b/llvm/test/CodeGen/ARM/bfx.ll
@@ -1,8 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=arm-eabi -mattr=+v7 %s -o - | FileCheck %s
define i32 @sbfx1(i32 %a) {
-; CHECK: sbfx1
-; CHECK: sbfx r0, r0, #7, #11
+; CHECK-LABEL: sbfx1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: sbfx r0, r0, #7, #11
+; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = trunc i32 %t1 to i11
%t3 = sext i11 %t2 to i32
@@ -10,8 +13,10 @@ define i32 @sbfx1(i32 %a) {
}
define i32 @ubfx1(i32 %a) {
-; CHECK: ubfx1
-; CHECK: ubfx r0, r0, #7, #11
+; CHECK-LABEL: ubfx1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: ubfx r0, r0, #7, #11
+; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = trunc i32 %t1 to i11
%t3 = zext i11 %t2 to i32
@@ -19,8 +24,10 @@ define i32 @ubfx1(i32 %a) {
}
define i32 @ubfx2(i32 %a) {
-; CHECK: ubfx2
-; CHECK: ubfx r0, r0, #7, #11
+; CHECK-LABEL: ubfx2:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: ubfx r0, r0, #7, #11
+; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = and i32 %t1, 2047
ret i32 %t2
@@ -28,14 +35,18 @@ define i32 @ubfx2(i32 %a) {
; rdar://12870177
define i32 @ubfx_opt(ptr nocapture %ctx, i32 %x) nounwind readonly ssp {
+; CHECK-LABEL: ubfx_opt:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsr r2, r1, #24
+; CHECK-NEXT: ldr r2, [r0, r2, lsl #2]
+; CHECK-NEXT: ubfx r3, r1, #16, #8
+; CHECK-NEXT: ldr r3, [r0, r3, lsl #2]
+; CHECK-NEXT: ubfx r1, r1, #8, #8
+; CHECK-NEXT: ldr r0, [r0, r1, lsl #2]
+; CHECK-NEXT: add r2, r3, r2
+; CHECK-NEXT: add r0, r2, r0
+; CHECK-NEXT: bx lr
entry:
-; CHECK: ubfx_opt
-; CHECK: lsr [[REG1:(lr|r[0-9]+)]], r1, #24
-; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG1]], lsl #2]
-; CHECK: ubfx [[REG2:(lr|r[0-9]+)]], r1, #16, #8
-; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG2]], lsl #2]
-; CHECK: ubfx [[REG3:(lr|r[0-9]+)]], r1, #8, #8
-; CHECK: ldr {{lr|r[0-9]+}}, [r0, [[REG3]], lsl #2]
%and = lshr i32 %x, 8
%shr = and i32 %and, 255
%and1 = lshr i32 %x, 16
@@ -53,16 +64,20 @@ entry:
}
define i32 @ubfx3(i32 %a) {
-; CHECK: ubfx3
-; CHECK: ubfx r0, r0, #11, #1
+; CHECK-LABEL: ubfx3:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: ubfx r0, r0, #11, #1
+; CHECK-NEXT: bx lr
%t1 = and i32 %a, 2048
%t2 = lshr i32 %t1, 11
ret i32 %t2
}
define i32 @ubfx4(i32 %a) {
-; CHECK: ubfx4
-; CHECK: ubfx r0, r0, #7, #3
+; CHECK-LABEL: ubfx4:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: ubfx r0, r0, #7, #3
+; CHECK-NEXT: bx lr
%t1 = and i32 %a, 896
%t2 = lshr i32 %t1, 7
ret i32 %t2
diff --git a/llvm/test/CodeGen/ARM/sbfx.ll b/llvm/test/CodeGen/ARM/sbfx.ll
index 5b77c59bca967a..72e9b5b1c9c425 100644
--- a/llvm/test/CodeGen/ARM/sbfx.ll
+++ b/llvm/test/CodeGen/ARM/sbfx.ll
@@ -1,46 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=arm-eabi -mattr=+v6t2 %s -o - | FileCheck %s
define i32 @f1(i32 %a) {
-entry:
; CHECK-LABEL: f1:
-; CHECK: sbfx r0, r0, #0, #20
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sbfx r0, r0, #0, #20
+; CHECK-NEXT: bx lr
+entry:
%tmp = shl i32 %a, 12
%tmp2 = ashr i32 %tmp, 12
ret i32 %tmp2
}
define i32 @f2(i32 %a) {
-entry:
; CHECK-LABEL: f2:
-; CHECK: bfc r0, #20, #12
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: bfc r0, #20, #12
+; CHECK-NEXT: bx lr
+entry:
%tmp = shl i32 %a, 12
%tmp2 = lshr i32 %tmp, 12
ret i32 %tmp2
}
define i32 @f3(i32 %a) {
-entry:
; CHECK-LABEL: f3:
-; CHECK: sbfx r0, r0, #5, #3
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sbfx r0, r0, #5, #3
+; CHECK-NEXT: bx lr
+entry:
%tmp = shl i32 %a, 24
%tmp2 = ashr i32 %tmp, 29
ret i32 %tmp2
}
define i32 @f4(i32 %a) {
-entry:
; CHECK-LABEL: f4:
-; CHECK: ubfx r0, r0, #5, #3
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ubfx r0, r0, #5, #3
+; CHECK-NEXT: bx lr
+entry:
%tmp = shl i32 %a, 24
%tmp2 = lshr i32 %tmp, 29
ret i32 %tmp2
}
define i32 @f5(i32 %a) {
-entry:
; CHECK-LABEL: f5:
-; CHECK-NOT: sbfx
-; CHECK: bx
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsl r0, r0, #3
+; CHECK-NEXT: asr r0, r0, #1
+; CHECK-NEXT: bx lr
+entry:
%tmp = shl i32 %a, 3
%tmp2 = ashr i32 %tmp, 1
ret i32 %tmp2
@@ -48,7 +59,9 @@ entry:
define signext i8 @f6(i32 %a) {
; CHECK-LABEL: f6:
-; CHECK: sbfx r0, r0, #23, #8
+; CHECK: @ %bb.0:
+; CHECK-NEXT: sbfx r0, r0, #23, #8
+; CHECK-NEXT: bx lr
%tmp = lshr i32 %a, 23
%res = trunc i32 %tmp to i8
@@ -57,7 +70,9 @@ define signext i8 @f6(i32 %a) {
define signext i8 @f7(i32 %a) {
; CHECK-LABEL: f7:
-; CHECK-NOT: sbfx
+; CHECK: @ %bb.0:
+; CHECK-NEXT: lsr r0, r0, #25
+; CHECK-NEXT: bx lr
%tmp = lshr i32 %a, 25
%res = trunc i32 %tmp to i8
diff --git a/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll b/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
index a9eda31e729e2c..58a5bf1bda71da 100644
--- a/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
+++ b/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
@@ -1,13 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=armv7a -mattr=+hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,DIV
; RUN: llc -mtriple=armv7a -mattr=-hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,NODIV
; Check SREM
define dso_local i32 @test_rem(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: test_rem
-; CHECK: asr r1, r0, #31
-; CHECK-NEXT: add r1, r0, r1, lsr #30
-; CHECK-NEXT: bic r1, r1, #3
-; CHECK-NEXT: sub r0, r0, r1
+; CHECK-LABEL: test_rem:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: asr r1, r0, #31
+; CHECK-NEXT: add r1, r0, r1, lsr #30
+; CHECK-NEXT: bic r1, r1, #3
+; CHECK-NEXT: sub r0, r0, r1
+; CHECK-NEXT: bx lr
entry:
%div = srem i32 %F, 4
@@ -16,18 +19,22 @@ entry:
; Try an i16 sdiv, with a small immediate.
define dso_local signext i16 @f0(i16 signext %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f0
+; DIV-LABEL: f0:
+; DIV: @ %bb.0: @ %entry
+; DIV-NEXT: mov r1, #2
+; DIV-NEXT: sdiv r0, r0, r1
+; DIV-NEXT: sxth r0, r0
+; DIV-NEXT: bx lr
+;
+; NODIV-LABEL: f0:
+; NODIV: @ %bb.0: @ %entry
+; NODIV-NEXT: uxth r1, r0
+; NODIV-NEXT: add r0, r0, r1, lsr #15
+; NODIV-NEXT: sxth r0, r0
+; NODIV-NEXT: asr r0, r0, #1
+; NODIV-NEXT: bx lr
-; DIV: mov r1, #2
-; DIV-NEXT: sdiv r0, r0, r1
-; DIV-NEXT: sxth r0, r0
-; DIV-NEXT: bx lr
-; NODIV: uxth r1, r0
-; NODIV-NEXT: add r0, r0, r1, lsr #15
-; NODIV-NEXT: sxth r0, r0
-; NODIV-NEXT: asr r0, r0, #1
-; NODIV-NEXT: bx lr
entry:
%0 = sdiv i16 %F, 2
@@ -36,16 +43,20 @@ entry:
; Try an i32 sdiv, with a small immediate.
define dso_local i32 @f1(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f1
+; DIV-LABEL: f1:
+; DIV: @ %bb.0: @ %entry
+; DIV-NEXT: mov r1, #4
+; DIV-NEXT: sdiv r0, r0, r1
+; DIV-NEXT: bx lr
+;
+; NODIV-LABEL: f1:
+; NODIV: @ %bb.0: @ %entry
+; NODIV-NEXT: asr r1, r0, #31
+; NODIV-NEXT: add r0, r0, r1, lsr #30
+; NODIV-NEXT: asr r0, r0, #2
+; NODIV-NEXT: bx lr
-; DIV: mov r1, #4
-; DIV-NEXT: sdiv r0, r0, r1
-; DIV-NEXT: bx lr
-; NODIV: asr r1, r0, #31
-; NODIV-NEXT: add r0, r0, r1, lsr #30
-; NODIV-NEXT: asr r0, r0, #2
-; NODIV-NEXT: bx lr
entry:
%div = sdiv i32 %F, 4
@@ -55,10 +66,18 @@ entry:
; Try a large power of 2 immediate, which should also be materialised with 1
; move immediate instruction.
define dso_local i32 @f2(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f2
-; DIV: mov r1, #131072
-; DIV-NEXT: sdiv r0, r0, r1
-; DIV-NEXT: bx lr
+; DIV-LABEL: f2:
+; DIV: @ %bb.0: @ %entry
+; DIV-NEXT: mov r1, #131072
+; DIV-NEXT: sdiv r0, r0, r1
+; DIV-NEXT: bx lr
+;
+; NODIV-LABEL: f2:
+; NODIV: @ %bb.0: @ %entry
+; NODIV-NEXT: asr r1, r0, #31
+; NODIV-NEXT: add r0, r0, r1, lsr #15
+; NODIV-NEXT: asr r0, r0, #17
+; NODIV-NEXT: bx lr
entry:
%div = sdiv i32 %F, 131072
ret i32 %div
@@ -66,11 +85,12 @@ entry:
; MinSize not set, so should expand to the faster but longer sequence.
define dso_local i32 @f3(i32 %F) {
-; CHECK-LABEL: f3
-; CHECK: asr r1, r0, #31
-; CHECK-NEXT: add r0, r0, r1, lsr #30
-; CHECK-NEXT: asr r0, r0, #2
-; CHECK-NEXT: bx lr
+; CHECK-LABEL: f3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: asr r1, r0, #31
+; CHECK-NEXT: add r0, r0, r1, lsr #30
+; CHECK-NEXT: asr r0, r0, #2
+; CHECK-NEXT: bx lr
entry:
%div = sdiv i32 %F, 4
ret i32 %div
diff --git a/llvm/test/CodeGen/ARM/shift-combine.ll b/llvm/test/CodeGen/ARM/shift-combine.ll
index 66417cddd4d566..196d9340a7ce59 100644
--- a/llvm/test/CodeGen/ARM/shift-combine.ll
+++ b/llvm/test/CodeGen/ARM/shift-combine.ll
@@ -1240,6 +1240,67 @@ define <4 x i32> @or_tree_with_shifts_vec_i32(<4 x i32> %a, <4 x i32> %b, <4 x i
; CHECK-BE-NEXT: vorr q8, q8, q10
; CHECK-BE-NEXT: vrev64.32 q0, q8
; CHECK-BE-NEXT: bx lr
+;
+; CHECK-ALIGN-LABEL: or_tree_with_shifts_vec_i32:
+; CHECK-ALIGN: @ %bb.0:
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #16]
+; CHECK-ALIGN-NEXT: orr.w r12, r12, r0
+; CHECK-ALIGN-NEXT: ldr r0, [sp]
+; CHECK-ALIGN-NEXT: orr.w r12, r0, r12, lsl #16
+; CHECK-ALIGN-NEXT: ldr r0, [sp, #32]
+; CHECK-ALIGN-NEXT: orr.w r0, r0, r12
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #20]
+; CHECK-ALIGN-NEXT: orr.w r12, r12, r1
+; CHECK-ALIGN-NEXT: ldr r1, [sp, #4]
+; CHECK-ALIGN-NEXT: orr.w r12, r1, r12, lsl #16
+; CHECK-ALIGN-NEXT: ldr r1, [sp, #36]
+; CHECK-ALIGN-NEXT: orr.w r1, r1, r12
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #24]
+; CHECK-ALIGN-NEXT: orr.w r12, r12, r2
+; CHECK-ALIGN-NEXT: ldr r2, [sp, #8]
+; CHECK-ALIGN-NEXT: orr.w r12, r2, r12, lsl #16
+; CHECK-ALIGN-NEXT: ldr r2, [sp, #40]
+; CHECK-ALIGN-NEXT: orr.w r2, r2, r12
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #28]
+; CHECK-ALIGN-NEXT: orr.w r12, r12, r3
+; CHECK-ALIGN-NEXT: ldr r3, [sp, #12]
+; CHECK-ALIGN-NEXT: orr.w r12, r3, r12, lsl #16
+; CHECK-ALIGN-NEXT: ldr r3, [sp, #44]
+; CHECK-ALIGN-NEXT: orr.w r3, r3, r12
+; CHECK-ALIGN-NEXT: bx lr
+;
+; CHECK-V6M-LABEL: or_tree_with_shifts_vec_i32:
+; CHECK-V6M: @ %bb.0:
+; CHECK-V6M-NEXT: push {r4, lr}
+; CHECK-V6M-NEXT: ldr r4, [sp, #24]
+; CHECK-V6M-NEXT: orrs r4, r0
+; CHECK-V6M-NEXT: lsls r0, r4, #16
+; CHECK-V6M-NEXT: ldr r4, [sp, #8]
+; CHECK-V6M-NEXT: orrs r4, r0
+; CHECK-V6M-NEXT: ldr r0, [sp, #40]
+; CHECK-V6M-NEXT: orrs r0, r4
+; CHECK-V6M-NEXT: ldr r4, [sp, #28]
+; CHECK-V6M-NEXT: orrs r4, r1
+; CHECK-V6M-NEXT: lsls r1, r4, #16
+; CHECK-V6M-NEXT: ldr r4, [sp, #12]
+; CHECK-V6M-NEXT: orrs r4, r1
+; CHECK-V6M-NEXT: ldr r1, [sp, #44]
+; CHECK-V6M-NEXT: orrs r1, r4
+; CHECK-V6M-NEXT: ldr r4, [sp, #32]
+; CHECK-V6M-NEXT: orrs r4, r2
+; CHECK-V6M-NEXT: lsls r2, r4, #16
+; CHECK-V6M-NEXT: ldr r4, [sp, #16]
+; CHECK-V6M-NEXT: orrs r4, r2
+; CHECK-V6M-NEXT: ldr r2, [sp, #48]
+; CHECK-V6M-NEXT: orrs r2, r4
+; CHECK-V6M-NEXT: ldr r4, [sp, #36]
+; CHECK-V6M-NEXT: orrs r4, r3
+; CHECK-V6M-NEXT: lsls r3, r4, #16
+; CHECK-V6M-NEXT: ldr r4, [sp, #20]
+; CHECK-V6M-NEXT: orrs r4, r3
+; CHECK-V6M-NEXT: ldr r3, [sp, #52]
+; CHECK-V6M-NEXT: orrs r3, r4
+; CHECK-V6M-NEXT: pop {r4, pc}
%a.shifted = shl <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
%c.shifted = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
%or.ab = or <4 x i32> %a.shifted, %b
@@ -1271,6 +1332,72 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32
; CHECK-BE-NEXT: vorr q8, q9, q8
; CHECK-BE-NEXT: vrev64.32 q0, q8
; CHECK-BE-NEXT: bx lr
+;
+; CHECK-ALIGN-LABEL: or_tree_with_mismatching_shifts_vec_i32:
+; CHECK-ALIGN: @ %bb.0:
+; CHECK-ALIGN-NEXT: push {r7, lr}
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #24]
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #40]
+; CHECK-ALIGN-NEXT: orr.w r12, lr, r12, lsl #17
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #8]
+; CHECK-ALIGN-NEXT: orr.w r0, lr, r0, lsl #16
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #44]
+; CHECK-ALIGN-NEXT: orr.w r0, r0, r12
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #28]
+; CHECK-ALIGN-NEXT: orr.w r12, lr, r12, lsl #17
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #12]
+; CHECK-ALIGN-NEXT: orr.w r1, lr, r1, lsl #16
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #48]
+; CHECK-ALIGN-NEXT: orr.w r1, r1, r12
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #32]
+; CHECK-ALIGN-NEXT: orr.w r12, lr, r12, lsl #17
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #16]
+; CHECK-ALIGN-NEXT: orr.w r2, lr, r2, lsl #16
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #52]
+; CHECK-ALIGN-NEXT: orr.w r2, r2, r12
+; CHECK-ALIGN-NEXT: ldr.w r12, [sp, #36]
+; CHECK-ALIGN-NEXT: orr.w r12, lr, r12, lsl #17
+; CHECK-ALIGN-NEXT: ldr.w lr, [sp, #20]
+; CHECK-ALIGN-NEXT: orr.w r3, lr, r3, lsl #16
+; CHECK-ALIGN-NEXT: orr.w r3, r3, r12
+; CHECK-ALIGN-NEXT: pop {r7, pc}
+;
+; CHECK-V6M-LABEL: or_tree_with_mismatching_shifts_vec_i32:
+; CHECK-V6M: @ %bb.0:
+; CHECK-V6M-NEXT: push {r4, r5, r7, lr}
+; CHECK-V6M-NEXT: ldr r4, [sp, #32]
+; CHECK-V6M-NEXT: lsls r4, r4, #17
+; CHECK-V6M-NEXT: ldr r5, [sp, #48]
+; CHECK-V6M-NEXT: orrs r5, r4
+; CHECK-V6M-NEXT: lsls r4, r0, #16
+; CHECK-V6M-NEXT: ldr r0, [sp, #16]
+; CHECK-V6M-NEXT: orrs r0, r4
+; CHECK-V6M-NEXT: orrs r0, r5
+; CHECK-V6M-NEXT: ldr r4, [sp, #36]
+; CHECK-V6M-NEXT: lsls r4, r4, #17
+; CHECK-V6M-NEXT: ldr r5, [sp, #52]
+; CHECK-V6M-NEXT: orrs r5, r4
+; CHECK-V6M-NEXT: lsls r4, r1, #16
+; CHECK-V6M-NEXT: ldr r1, [sp, #20]
+; CHECK-V6M-NEXT: orrs r1, r4
+; CHECK-V6M-NEXT: orrs r1, r5
+; CHECK-V6M-NEXT: ldr r4, [sp, #40]
+; CHECK-V6M-NEXT: lsls r4, r4, #17
+; CHECK-V6M-NEXT: ldr r5, [sp, #56]
+; CHECK-V6M-NEXT: orrs r5, r4
+; CHECK-V6M-NEXT: lsls r4, r2, #16
+; CHECK-V6M-NEXT: ldr r2, [sp, #24]
+; CHECK-V6M-NEXT: orrs r2, r4
+; CHECK-V6M-NEXT: orrs r2, r5
+; CHECK-V6M-NEXT: ldr r4, [sp, #44]
+; CHECK-V6M-NEXT: lsls r4, r4, #17
+; CHECK-V6M-NEXT: ldr r5, [sp, #60]
+; CHECK-V6M-NEXT: orrs r5, r4
+; CHECK-V6M-NEXT: lsls r4, r3, #16
+; CHECK-V6M-NEXT: ldr r3, [sp, #28]
+; CHECK-V6M-NEXT: orrs r3, r4
+; CHECK-V6M-NEXT: orrs r3, r5
+; CHECK-V6M-NEXT: pop {r4, r5, r7, pc}
%a.shifted = shl <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
%c.shifted = shl <4 x i32> %c, <i32 17, i32 17, i32 17, i32 17>
%or.ab = or <4 x i32> %a.shifted, %b
diff --git a/llvm/test/CodeGen/BPF/remove_truncate_9.ll b/llvm/test/CodeGen/BPF/remove_truncate_9.ll
index 3b9293d38fd01f..7656943ad3049a 100644
--- a/llvm/test/CodeGen/BPF/remove_truncate_9.ll
+++ b/llvm/test/CodeGen/BPF/remove_truncate_9.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mcpu=v2 -march=bpf < %s | FileCheck %s
; RUN: llc -mcpu=v4 -march=bpf < %s | FileCheck %s
@@ -79,3 +80,5 @@ declare void @sink1(i8, i64, i64, i64, i1);
declare void @sink2(i16, i64, i64, i64, i1);
declare void @sink3(i8, i1);
declare void @sink4(i32, i1);
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/Mips/cins.ll b/llvm/test/CodeGen/Mips/cins.ll
index 4fe25564d1c12d..d00138a3ce37a7 100644
--- a/llvm/test/CodeGen/Mips/cins.ll
+++ b/llvm/test/CodeGen/Mips/cins.ll
@@ -1,92 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -march=mips64 -mcpu=octeon -target-abi=n64 < %s -o - | FileCheck %s
define i64 @cins_zext(i32 signext %n) {
+; CHECK-LABEL: cins_zext:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins $2, $4, 5, 26
entry:
%shl = shl i32 %n, 5
%conv = zext i32 %shl to i64
ret i64 %conv
-; CHECK-LABEL: cins_zext:
-; CHECK: cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 5, 26
}
define i64 @cins_and_shl(i64 zeroext %n) {
+; CHECK-LABEL: cins_and_shl:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins $2, $4, 8, 15
entry:
%and = shl i64 %n, 8
%shl = and i64 %and, 16776960
ret i64 %shl
-; CHECK-LABEL: cins_and_shl:
-; CHECK: cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 8, 15
}
define i64 @cins_and_shl32(i64 zeroext %n) {
+; CHECK-LABEL: cins_and_shl32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins32 $2, $4, 6, 15
entry:
%and = shl i64 %n, 38
%shl = and i64 %and, 18014123631575040
ret i64 %shl
-; CHECK-LABEL: cins_and_shl32:
-; CHECK: cins32 $[[R0:[0-9]+]], $[[R1:[0-9]+]], 6, 15
}
define zeroext i16 @cins_and_shl_16(i16 zeroext %n) {
+; CHECK-LABEL: cins_and_shl_16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins $2, $4, 2, 3
entry:
%0 = shl i16 %n, 2
%1 = and i16 %0, 60
ret i16 %1
-; CHECK-LABEL: cins_and_shl_16:
-; CHECK: cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 2, 3
}
define zeroext i8 @cins_and_shl_8(i8 zeroext %n) {
+; CHECK-LABEL: cins_and_shl_8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins $2, $4, 2, 1
entry:
%0 = shl i8 %n, 2
%1 = and i8 %0, 12
ret i8 %1
-; CHECK-LABEL: cins_and_shl_8:
-; CHECK: cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 2, 1
}
define i32 @cins_i32(i32 signext %a) {
+; CHECK-LABEL: cins_i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins $2, $4, 17, 11
entry:
%and = shl i32 %a, 17
%shl = and i32 %and, 536739840
ret i32 %shl
-; CHECK-LABEL: cins_i32:
-; CHECK: cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 17, 11
}
define i64 @cins_shl_and(i32 signext %n) {
+; CHECK-LABEL: cins_shl_and:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins $2, $4, 31, 15
entry:
%and = and i32 %n, 65535
%conv = zext i32 %and to i64
%shl = shl nuw nsw i64 %conv, 31
ret i64 %shl
-; CHECK-LABEL: cins_shl_and:
-; CHECK: cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 31, 15
}
define i64 @cins_shl_and32(i32 signext %n) {
+; CHECK-LABEL: cins_shl_and32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jr $ra
+; CHECK-NEXT: cins32 $2, $4, 15, 15
entry:
%and = and i32 %n, 65535
%conv = zext i32 %and to i64
%shl = shl nuw nsw i64 %conv, 47
ret i64 %shl
-; CHECK-LABEL: cins_shl_and32:
-; CHECK: cins32 $[[R0:[0-9]+]], $[[R1:[0-9]+]], 15, 15
}
diff --git a/llvm/test/CodeGen/Mips/fabs.ll b/llvm/test/CodeGen/Mips/fabs.ll
index 75aa7d8295f94e..e596d93509feeb 100644
--- a/llvm/test/CodeGen/Mips/fabs.ll
+++ b/llvm/test/CodeGen/Mips/fabs.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Check that abs.[ds] is only selected for mips32r6 or mips64r6 when no
; additional options are passed. For revisions prior mips32r6 and mips64r6,
; abs.[ds] does not generate the correct result when working with NaNs, and
@@ -73,12 +74,13 @@
; RUN: -enable-no-nans-fp-math | FileCheck %s -check-prefix=CHECK-ABS2008
define float @foo0(float %a) nounwind readnone {
+; CHECK-ABS2008-LABEL: foo0:
+; CHECK-ABS2008: # %bb.0: # %entry
+; CHECK-ABS2008-NEXT: jr $ra
+; CHECK-ABS2008-NEXT: abs.s $f0, $f12
entry:
; CHECK-LABEL: foo0
-; CHECK-ABS2008: abs.s
-; CHECK-ABSLEGACY: {{(ori|ins)}}
-; CHECK-ABSLEGACY-NOT: abs.s
%call = tail call float @fabsf(float %a) nounwind readnone
ret float %call
@@ -87,15 +89,18 @@ entry:
declare float @fabsf(float) nounwind readnone
define double @foo1(double %a) nounwind readnone {
+; CHECK-ABS2008-LABEL: foo1:
+; CHECK-ABS2008: # %bb.0: # %entry
+; CHECK-ABS2008-NEXT: jr $ra
+; CHECK-ABS2008-NEXT: abs.d $f0, $f12
entry:
; CHECK-LABEL: foo1:
-; CHECK-ABS2008: abs.d
-; CHECK-ABSLEGACY: {{(ori|ins|dsll)}}
-; CHECK-ABSLEGACY-NOT: abs.d
%call = tail call double @fabs(double %a) nounwind readnone
ret double %call
}
declare double @fabs(double) nounwind readnone
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-ABSLEGACY: {{.*}}
diff --git a/llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll b/llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll
index 695431a5ab6074..cc2c674f89586b 100644
--- a/llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll
+++ b/llvm/test/CodeGen/Mips/fcopysign-f32-f64.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -verify-machineinstrs -march=mips64el -mcpu=mips4 \
; RUN: -target-abi=n64 | FileCheck %s -check-prefixes=ALL,64
; RUN: llc < %s -verify-machineinstrs -march=mips64el -mcpu=mips64 \
@@ -10,21 +11,46 @@ declare double @copysign(double, double) nounwind readnone
declare float @copysignf(float, float) nounwind readnone
define float @func2(float %d, double %f) nounwind readnone {
+; 64-LABEL: func2:
+; 64: # %bb.0: # %entry
+; 64-NEXT: lui $1, %highest(.LCPI0_0)
+; 64-NEXT: daddiu $1, $1, %higher(.LCPI0_0)
+; 64-NEXT: dsll $1, $1, 16
+; 64-NEXT: daddiu $1, $1, %hi(.LCPI0_0)
+; 64-NEXT: dsll $1, $1, 16
+; 64-NEXT: lwc1 $f0, %lo(.LCPI0_0)($1)
+; 64-NEXT: add.s $f0, $f12, $f0
+; 64-NEXT: mfc1 $1, $f0
+; 64-NEXT: dmfc1 $2, $f13
+; 64-NEXT: lui $3, 32767
+; 64-NEXT: ori $3, $3, 65535
+; 64-NEXT: and $1, $1, $3
+; 64-NEXT: dsrl $2, $2, 63
+; 64-NEXT: sll $2, $2, 0
+; 64-NEXT: sll $2, $2, 31
+; 64-NEXT: or $1, $1, $2
+; 64-NEXT: jr $ra
+; 64-NEXT: mtc1 $1, $f0
+;
+; 64R2-LABEL: func2:
+; 64R2: # %bb.0: # %entry
+; 64R2-NEXT: lui $1, %highest(.LCPI0_0)
+; 64R2-NEXT: daddiu $1, $1, %higher(.LCPI0_0)
+; 64R2-NEXT: dsll $1, $1, 16
+; 64R2-NEXT: daddiu $1, $1, %hi(.LCPI0_0)
+; 64R2-NEXT: dsll $1, $1, 16
+; 64R2-NEXT: lwc1 $f0, %lo(.LCPI0_0)($1)
+; 64R2-NEXT: add.s $f0, $f12, $f0
+; 64R2-NEXT: mfc1 $1, $f0
+; 64R2-NEXT: dmfc1 $2, $f13
+; 64R2-NEXT: dextu $2, $2, 63, 1
+; 64R2-NEXT: sll $2, $2, 0
+; 64R2-NEXT: ins $1, $2, 31, 1
+; 64R2-NEXT: jr $ra
+; 64R2-NEXT: mtc1 $1, $f0
entry:
-; ALL-LABEL: func2:
-; 64-DAG: lui $[[T0:[0-9]+]], 32767
-; 64-DAG: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
-; 64-DAG: and $[[AND0:[0-9]+]], ${{[0-9]+}}, $[[MSK0]]
-; 64-DAG: dsrl $[[DSRL:[0-9]+]], ${{[0-9]+}}, 63
-; 64-DAG: sll $[[SLL0:[0-9]+]], $[[DSRL]], 0
-; 64-DAG: sll $[[SLL1:[0-9]+]], $[[SLL0]], 31
-; 64: or $[[OR:[0-9]+]], $[[AND0]], $[[SLL1]]
-; 64: mtc1 $[[OR]], $f0
-; 64R2: dextu ${{[0-9]+}}, ${{[0-9]+}}, 63, 1
-; 64R2: ins $[[INS:[0-9]+]], ${{[0-9]+}}, 31, 1
-; 64R2: mtc1 $[[INS]], $f0
%add = fadd float %d, 1.000000e+00
%conv = fptrunc double %f to float
@@ -33,26 +59,51 @@ entry:
}
define double @func3(double %d, float %f) nounwind readnone {
+; 64-LABEL: func3:
+; 64: # %bb.0: # %entry
+; 64-NEXT: lui $1, %highest(.LCPI1_0)
+; 64-NEXT: daddiu $1, $1, %higher(.LCPI1_0)
+; 64-NEXT: dsll $1, $1, 16
+; 64-NEXT: daddiu $1, $1, %hi(.LCPI1_0)
+; 64-NEXT: dsll $1, $1, 16
+; 64-NEXT: ldc1 $f0, %lo(.LCPI1_0)($1)
+; 64-NEXT: add.d $f0, $f12, $f0
+; 64-NEXT: mfc1 $1, $f13
+; 64-NEXT: daddiu $2, $zero, 1
+; 64-NEXT: dmfc1 $3, $f0
+; 64-NEXT: dsll $2, $2, 63
+; 64-NEXT: daddiu $2, $2, -1
+; 64-NEXT: and $2, $3, $2
+; 64-NEXT: srl $1, $1, 31
+; 64-NEXT: dsll $1, $1, 63
+; 64-NEXT: or $1, $2, $1
+; 64-NEXT: jr $ra
+; 64-NEXT: dmtc1 $1, $f0
+;
+; 64R2-LABEL: func3:
+; 64R2: # %bb.0: # %entry
+; 64R2-NEXT: lui $1, %highest(.LCPI1_0)
+; 64R2-NEXT: daddiu $1, $1, %higher(.LCPI1_0)
+; 64R2-NEXT: dsll $1, $1, 16
+; 64R2-NEXT: daddiu $1, $1, %hi(.LCPI1_0)
+; 64R2-NEXT: dsll $1, $1, 16
+; 64R2-NEXT: ldc1 $f0, %lo(.LCPI1_0)($1)
+; 64R2-NEXT: add.d $f0, $f12, $f0
+; 64R2-NEXT: dmfc1 $1, $f0
+; 64R2-NEXT: mfc1 $2, $f13
+; 64R2-NEXT: ext $2, $2, 31, 1
+; 64R2-NEXT: dext $2, $2, 0, 32
+; 64R2-NEXT: dinsu $1, $2, 63, 1
+; 64R2-NEXT: jr $ra
+; 64R2-NEXT: dmtc1 $1, $f0
entry:
-; ALL-LABEL: func3:
-
-; 64: mfc1 $[[MFC:[0-9]+]], $f13
-; 64: daddiu $[[R1:[0-9]+]], $zero, 1
-; 64: dmfc1 $[[R0:[0-9]+]], ${{.*}}
-; 64: dsll $[[R2:[0-9]+]], $[[R1]], 63
-; 64: daddiu $[[R3:[0-9]+]], $[[R2]], -1
-; 64: and $[[AND0:[0-9]+]], $[[R0]], $[[R3]]
-; 64: srl $[[SRL:[0-9]+]], $[[MFC:[0-9]+]], 31
-; 64: dsll $[[DSLL:[0-9]+]], $[[SRL]], 63
-; 64: or $[[OR:[0-9]+]], $[[AND0]], $[[DSLL]]
-; 64: dmtc1 $[[OR]], $f0
-
-; 64R2: ext ${{[0-9]+}}, ${{[0-9]+}}, 31, 1
-; 64R2: dinsu $[[INS:[0-9]+]], ${{[0-9]+}}, 63, 1
-; 64R2: dmtc1 $[[INS]], $f0
+
+
%add = fadd double %d, 1.000000e+00
%conv = fpext float %f to double
%call = tail call double @copysign(double %add, double %conv) nounwind readnone
ret double %call
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ALL: {{.*}}
diff --git a/llvm/test/CodeGen/Mips/fcopysign.ll b/llvm/test/CodeGen/Mips/fcopysign.ll
index 810d0f9580861c..167354aaf085a4 100644
--- a/llvm/test/CodeGen/Mips/fcopysign.ll
+++ b/llvm/test/CodeGen/Mips/fcopysign.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -verify-machineinstrs -march=mipsel -mcpu=mips32 \
; RUN: | FileCheck %s -check-prefix=32
; RUN: llc < %s -verify-machineinstrs -march=mipsel -mcpu=mips32r2 \
@@ -10,31 +11,58 @@
; RUN: | FileCheck %s -check-prefix=64R2
define double @func0(double %d0, double %d1) nounwind readnone {
-entry:
+; 32-LABEL: func0:
+; 32: # %bb.0: # %entry
+; 32-NEXT: mfc1 $1, $f15
+; 32-NEXT: lui $2, 32768
+; 32-NEXT: and $1, $1, $2
+; 32-NEXT: lui $2, 32767
+; 32-NEXT: ori $2, $2, 65535
+; 32-NEXT: mfc1 $3, $f13
+; 32-NEXT: and $2, $3, $2
+; 32-NEXT: or $1, $2, $1
+; 32-NEXT: mfc1 $2, $f12
+; 32-NEXT: mtc1 $2, $f0
+; 32-NEXT: jr $ra
+; 32-NEXT: mtc1 $1, $f1
+;
+; 32R2-LABEL: func0:
+; 32R2: # %bb.0: # %entry
+; 32R2-NEXT: mfhc1 $1, $f12
+; 32R2-NEXT: mfhc1 $2, $f14
+; 32R2-NEXT: ext $2, $2, 31, 1
+; 32R2-NEXT: ins $1, $2, 31, 1
+; 32R2-NEXT: mfc1 $2, $f12
+; 32R2-NEXT: mtc1 $2, $f0
+; 32R2-NEXT: mthc1 $1, $f0
+; 32R2-NEXT: jr $ra
+; 32R2-NEXT: nop
;
-; 32: lui $[[MSK1:[0-9]+]], 32768
-; 32: and $[[AND1:[0-9]+]], ${{[0-9]+}}, $[[MSK1]]
-; 32: lui $[[T0:[0-9]+]], 32767
-; 32: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
-; 32: and $[[AND0:[0-9]+]], ${{[0-9]+}}, $[[MSK0]]
-; 32: or $[[OR:[0-9]+]], $[[AND0]], $[[AND1]]
-; 32: mtc1 $[[OR]], $f1
+; 64-LABEL: func0:
+; 64: # %bb.0: # %entry
+; 64-NEXT: daddiu $1, $zero, 1
+; 64-NEXT: dsll $1, $1, 63
+; 64-NEXT: dmfc1 $2, $f13
+; 64-NEXT: and $2, $2, $1
+; 64-NEXT: dmfc1 $3, $f12
+; 64-NEXT: daddiu $1, $1, -1
+; 64-NEXT: and $1, $3, $1
+; 64-NEXT: or $1, $1, $2
+; 64-NEXT: jr $ra
+; 64-NEXT: dmtc1 $1, $f0
+;
+; 64R2-LABEL: func0:
+; 64R2: # %bb.0: # %entry
+; 64R2-NEXT: dmfc1 $1, $f12
+; 64R2-NEXT: dmfc1 $2, $f13
+; 64R2-NEXT: dextu $2, $2, 63, 1
+; 64R2-NEXT: dinsu $1, $2, 63, 1
+; 64R2-NEXT: jr $ra
+; 64R2-NEXT: dmtc1 $1, $f0
+entry:
-; 32R2: ext $[[EXT:[0-9]+]], ${{[0-9]+}}, 31, 1
-; 32R2: ins $[[INS:[0-9]+]], $[[EXT]], 31, 1
-; 32R2: mthc1 $[[INS]], $f0
-; 64: daddiu $[[T0:[0-9]+]], $zero, 1
-; 64: dsll $[[MSK1:[0-9]+]], $[[T0]], 63
-; 64: and $[[AND1:[0-9]+]], ${{[0-9]+}}, $[[MSK1]]
-; 64: daddiu $[[MSK0:[0-9]+]], $[[MSK1]], -1
-; 64: and $[[AND0:[0-9]+]], ${{[0-9]+}}, $[[MSK0]]
-; 64: or $[[OR:[0-9]+]], $[[AND0]], $[[AND1]]
-; 64: dmtc1 $[[OR]], $f0
-; 64R2: dextu $[[EXT:[0-9]+]], ${{[0-9]+}}, 63, 1
-; 64R2: dinsu $[[INS:[0-9]+]], $[[EXT]], 63, 1
-; 64R2: dmtc1 $[[INS]], $f0
%call = tail call double @copysign(double %d0, double %d1) nounwind readnone
ret double %call
@@ -43,19 +71,52 @@ entry:
declare double @copysign(double, double) nounwind readnone
define float @func1(float %f0, float %f1) nounwind readnone {
+; 32-LABEL: func1:
+; 32: # %bb.0: # %entry
+; 32-NEXT: mfc1 $1, $f14
+; 32-NEXT: lui $2, 32768
+; 32-NEXT: and $1, $1, $2
+; 32-NEXT: lui $2, 32767
+; 32-NEXT: ori $2, $2, 65535
+; 32-NEXT: mfc1 $3, $f12
+; 32-NEXT: and $2, $3, $2
+; 32-NEXT: or $1, $2, $1
+; 32-NEXT: jr $ra
+; 32-NEXT: mtc1 $1, $f0
+;
+; 32R2-LABEL: func1:
+; 32R2: # %bb.0: # %entry
+; 32R2-NEXT: mfc1 $1, $f12
+; 32R2-NEXT: mfc1 $2, $f14
+; 32R2-NEXT: ext $2, $2, 31, 1
+; 32R2-NEXT: ins $1, $2, 31, 1
+; 32R2-NEXT: jr $ra
+; 32R2-NEXT: mtc1 $1, $f0
+;
+; 64-LABEL: func1:
+; 64: # %bb.0: # %entry
+; 64-NEXT: mfc1 $1, $f13
+; 64-NEXT: lui $2, 32768
+; 64-NEXT: and $1, $1, $2
+; 64-NEXT: lui $2, 32767
+; 64-NEXT: ori $2, $2, 65535
+; 64-NEXT: mfc1 $3, $f12
+; 64-NEXT: and $2, $3, $2
+; 64-NEXT: or $1, $2, $1
+; 64-NEXT: jr $ra
+; 64-NEXT: mtc1 $1, $f0
+;
+; 64R2-LABEL: func1:
+; 64R2: # %bb.0: # %entry
+; 64R2-NEXT: mfc1 $1, $f12
+; 64R2-NEXT: mfc1 $2, $f13
+; 64R2-NEXT: ext $2, $2, 31, 1
+; 64R2-NEXT: ins $1, $2, 31, 1
+; 64R2-NEXT: jr $ra
+; 64R2-NEXT: mtc1 $1, $f0
entry:
-; 32: lui $[[MSK1:[0-9]+]], 32768
-; 32: and $[[AND1:[0-9]+]], ${{[0-9]+}}, $[[MSK1]]
-; 32: lui $[[T0:[0-9]+]], 32767
-; 32: ori $[[MSK0:[0-9]+]], $[[T0]], 65535
-; 32: and $[[AND0:[0-9]+]], ${{[0-9]+}}, $[[MSK0]]
-; 32: or $[[OR:[0-9]+]], $[[AND0]], $[[AND1]]
-; 32: mtc1 $[[OR]], $f0
-; 32R2: ext $[[EXT:[0-9]+]], ${{[0-9]+}}, 31, 1
-; 32R2: ins $[[INS:[0-9]+]], $[[EXT]], 31, 1
-; 32R2: mtc1 $[[INS]], $f0
%call = tail call float @copysignf(float %f0, float %f1) nounwind readnone
ret float %call
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/abs.ll b/llvm/test/CodeGen/Mips/llvm-ir/abs.ll
index c0812977e3a11b..ea0e34fb2b0856 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/abs.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/abs.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -march=mips -mcpu=mips32 -asm-show-inst < %s | FileCheck %s --check-prefix=MIPS32
; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+abs2008,+fp64 -asm-show-inst < %s | FileCheck %s --check-prefix=MIPS32FP64
; RUN: llc -march=mips -mcpu=mips32r3 -mattr=+abs2008,+micromips -asm-show-inst < %s | FileCheck %s --check-prefix=MM
@@ -5,23 +6,123 @@
; RUN: llc -march=mips -mcpu=mips32r6 -mattr=+micromips -asm-show-inst < %s | FileCheck %s --check-prefix=MMR6
define float @abs_s(float %a) {
-; MIPS32: {{(ori|ins)}}
-; MIPS32-NOT: abs.s
-; MIPS32FP64: abs.s {{.*}} # <MCInst #{{[0-9]+}} FABS_S
-; MM: abs.s {{.*}} # <MCInst #{{[0-9]+}} FABS_S_MM
-; MMFP64: abs.s {{.*}} # <MCInst #{{[0-9]+}} FABS_S_MM
-; MMR6: abs.s {{.*}} # <MCInst #{{[0-9]+}} FABS_S_MM
+; MIPS32-LABEL: abs_s:
+; MIPS32: # %bb.0:
+; MIPS32-NEXT: lui $1, 32767 # <MCInst #[[#MCINST1:]] LUi
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32-NEXT: # <MCOperand Imm:32767>>
+; MIPS32-NEXT: ori $1, $1, 65535 # <MCInst #[[#MCINST2:]] ORi
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Imm:65535>>
+; MIPS32-NEXT: mfc1 $2, $f12 # <MCInst #[[#MCINST3:]] MFC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2:]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+; MIPS32-NEXT: and $1, $2, $1 # <MCInst #[[#MCINST4:]] AND
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>>
+; MIPS32-NEXT: jr $ra # <MCInst #[[#MCINST5:]] JR
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
+; MIPS32-NEXT: mtc1 $1, $f0 # <MCInst #[[#MCINST6:]] MTC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>>
+;
+; MIPS32FP64-LABEL: abs_s:
+; MIPS32FP64: # %bb.0:
+; MIPS32FP64-NEXT: jr $ra # <MCInst #[[#MCINST5:]] JR
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
+; MIPS32FP64-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST7:]] FABS_S
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+;
+; MM-LABEL: abs_s:
+; MM: # %bb.0:
+; MM-NEXT: jr $ra # <MCInst #[[#MCINST8:]] JR_MM
+; MM-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
+; MM-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST9:]] FABS_S_MM
+; MM-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MM-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+;
+; MMFP64-LABEL: abs_s:
+; MMFP64: # %bb.0:
+; MMFP64-NEXT: jr $ra # <MCInst #[[#MCINST8:]] JR_MM
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
+; MMFP64-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST9:]] FABS_S_MM
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+;
+; MMR6-LABEL: abs_s:
+; MMR6: # %bb.0:
+; MMR6-NEXT: abs.s $f0, $f12 # <MCInst #[[#MCINST9:]] FABS_S_MM
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG5:]]>
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG3:]]>>
+; MMR6-NEXT: jrc $ra # <MCInst #[[#MCINST10:]] JRC16_MM
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG4:]]>>
%ret = call float @llvm.fabs.f32(float %a)
ret float %ret
}
define double @abs_d(double %a) {
-; MIPS32: {{(ori|ins|dsll)}}
-; MIPS32-NOT: abs.d
-; MIPS32FP64: abs.d {{.*}} # <MCInst #{{[0-9]+}} FABS_D64
-; MM: abs.d {{.*}} # <MCInst #{{[0-9]+}} FABS_D32_MM
-; MMFP64: abs.d {{.*}} # <MCInst #{{[0-9]+}} FABS_D64_MM
-; MMR6: abs.d {{.*}} # <MCInst #{{[0-9]+}} FABS_D64_MM
+; MIPS32-LABEL: abs_d:
+; MIPS32: # %bb.0:
+; MIPS32-NEXT: lui $1, 32767 # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Imm:32767>>
+; MIPS32-NEXT: ori $1, $1, 65535 # <MCInst #[[#MCINST2]] ORi
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Imm:65535>>
+; MIPS32-NEXT: mfc1 $2, $f13 # <MCInst #[[#MCINST3]] MFC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG6:]]>>
+; MIPS32-NEXT: and $1, $2, $1 # <MCInst #[[#MCINST4]] AND
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>>
+; MIPS32-NEXT: mfc1 $2, $f12 # <MCInst #[[#MCINST3]] MFC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG3]]>>
+; MIPS32-NEXT: mtc1 $2, $f0 # <MCInst #[[#MCINST6]] MTC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG5]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT: jr $ra # <MCInst #[[#MCINST5]] JR
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
+; MIPS32-NEXT: mtc1 $1, $f1 # <MCInst #[[#MCINST6]] MTC1
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG7:]]>
+; MIPS32-NEXT: # <MCOperand Reg:[[#MCREG1]]>>
+;
+; MIPS32FP64-LABEL: abs_d:
+; MIPS32FP64: # %bb.0:
+; MIPS32FP64-NEXT: jr $ra # <MCInst #[[#MCINST5]] JR
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
+; MIPS32FP64-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST11:]] FABS_D64
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG8:]]>
+; MIPS32FP64-NEXT: # <MCOperand Reg:[[#MCREG9:]]>>
+;
+; MM-LABEL: abs_d:
+; MM: # %bb.0:
+; MM-NEXT: jr $ra # <MCInst #[[#MCINST8]] JR_MM
+; MM-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
+; MM-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST12:]] FABS_D32_MM
+; MM-NEXT: # <MCOperand Reg:[[#MCREG10:]]>
+; MM-NEXT: # <MCOperand Reg:[[#MCREG11:]]>>
+;
+; MMFP64-LABEL: abs_d:
+; MMFP64: # %bb.0:
+; MMFP64-NEXT: jr $ra # <MCInst #[[#MCINST8]] JR_MM
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
+; MMFP64-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST13:]] FABS_D64_MM
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG8:]]>
+; MMFP64-NEXT: # <MCOperand Reg:[[#MCREG9:]]>>
+;
+; MMR6-LABEL: abs_d:
+; MMR6: # %bb.0:
+; MMR6-NEXT: abs.d $f0, $f12 # <MCInst #[[#MCINST13:]] FABS_D64_MM
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG8:]]>
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG9:]]>>
+; MMR6-NEXT: jrc $ra # <MCInst #[[#MCINST10]] JRC16_MM
+; MMR6-NEXT: # <MCOperand Reg:[[#MCREG4]]>>
%ret = call double @llvm.fabs.f64(double %a)
ret double %ret
}
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index f041f202777f61..08aa26bd340396 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK32
; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
@@ -22,6 +23,44 @@
; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @static_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+; CHECK32-LABEL: static_offset(
+; CHECK32: {
+; CHECK32-NEXT: .reg .pred %p<2>;
+; CHECK32-NEXT: .reg .b32 %r<7>;
+; CHECK32-EMPTY:
+; CHECK32-NEXT: // %bb.0: // %bb
+; CHECK32-NEXT: ld.param.u32 %r5, [static_offset_param_2];
+; CHECK32-NEXT: setp.ne.s32 %p1, %r5, 3;
+; CHECK32-NEXT: @%p1 bra $L__BB0_2;
+; CHECK32-NEXT: // %bb.1: // %bb3
+; CHECK32-NEXT: ld.param.u32 %r3, [static_offset_param_0];
+; CHECK32-NEXT: mov.b32 %r4, static_offset_param_1;
+; CHECK32-NEXT: mov.u32 %r1, %r4;
+; CHECK32-NEXT: cvta.to.global.u32 %r2, %r3;
+; CHECK32-NEXT: ld.param.u32 %r6, [%r1+12];
+; CHECK32-NEXT: st.global.u32 [%r2], %r6;
+; CHECK32-NEXT: $L__BB0_2: // %bb6
+; CHECK32-NEXT: ret;
+;
+; CHECK64-LABEL: static_offset(
+; CHECK64: {
+; CHECK64-NEXT: .reg .pred %p<2>;
+; CHECK64-NEXT: .reg .b32 %r<3>;
+; CHECK64-NEXT: .reg .b64 %rd<5>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT: // %bb.0: // %bb
+; CHECK64-NEXT: ld.param.u32 %r1, [static_offset_param_2];
+; CHECK64-NEXT: setp.ne.s32 %p1, %r1, 3;
+; CHECK64-NEXT: @%p1 bra $L__BB0_2;
+; CHECK64-NEXT: // %bb.1: // %bb3
+; CHECK64-NEXT: ld.param.u64 %rd3, [static_offset_param_0];
+; CHECK64-NEXT: mov.b64 %rd4, static_offset_param_1;
+; CHECK64-NEXT: mov.u64 %rd1, %rd4;
+; CHECK64-NEXT: cvta.to.global.u64 %rd2, %rd3;
+; CHECK64-NEXT: ld.param.u32 %r2, [%rd1+12];
+; CHECK64-NEXT: st.global.u32 [%rd2], %r2;
+; CHECK64-NEXT: $L__BB0_2: // %bb6
+; CHECK64-NEXT: ret;
bb:
%tmp = icmp eq i32 %arg2, 3
br i1 %tmp, label %bb3, label %bb6
@@ -56,6 +95,38 @@ bb6: ; preds = %bb3, %bb
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @dynamic_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
+; CHECK32-LABEL: dynamic_offset(
+; CHECK32: {
+; CHECK32-NEXT: .reg .b32 %r<9>;
+; CHECK32-EMPTY:
+; CHECK32-NEXT: // %bb.0: // %bb
+; CHECK32-NEXT: ld.param.u32 %r1, [dynamic_offset_param_0];
+; CHECK32-NEXT: mov.b32 %r2, dynamic_offset_param_1;
+; CHECK32-NEXT: mov.u32 %r3, %r2;
+; CHECK32-NEXT: cvta.to.global.u32 %r4, %r1;
+; CHECK32-NEXT: ld.param.u32 %r5, [dynamic_offset_param_2];
+; CHECK32-NEXT: shl.b32 %r6, %r5, 2;
+; CHECK32-NEXT: add.s32 %r7, %r3, %r6;
+; CHECK32-NEXT: ld.param.u32 %r8, [%r7];
+; CHECK32-NEXT: st.global.u32 [%r4], %r8;
+; CHECK32-NEXT: ret;
+;
+; CHECK64-LABEL: dynamic_offset(
+; CHECK64: {
+; CHECK64-NEXT: .reg .b32 %r<3>;
+; CHECK64-NEXT: .reg .b64 %rd<7>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT: // %bb.0: // %bb
+; CHECK64-NEXT: ld.param.u64 %rd1, [dynamic_offset_param_0];
+; CHECK64-NEXT: mov.b64 %rd2, dynamic_offset_param_1;
+; CHECK64-NEXT: mov.u64 %rd3, %rd2;
+; CHECK64-NEXT: cvta.to.global.u64 %rd4, %rd1;
+; CHECK64-NEXT: ld.param.u32 %r1, [dynamic_offset_param_2];
+; CHECK64-NEXT: mul.wide.s32 %rd5, %r1, 4;
+; CHECK64-NEXT: add.s64 %rd6, %rd3, %rd5;
+; CHECK64-NEXT: ld.param.u32 %r2, [%rd6];
+; CHECK64-NEXT: st.global.u32 [%rd4], %r2;
+; CHECK64-NEXT: ret;
bb:
%tmp = sext i32 %arg2 to i64
%tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
@@ -81,6 +152,40 @@ bb:
;
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @gep_bitcast(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
+; CHECK32-LABEL: gep_bitcast(
+; CHECK32: {
+; CHECK32-NEXT: .reg .b16 %rs<2>;
+; CHECK32-NEXT: .reg .b32 %r<8>;
+; CHECK32-EMPTY:
+; CHECK32-NEXT: // %bb.0: // %bb
+; CHECK32-NEXT: ld.param.u32 %r1, [gep_bitcast_param_0];
+; CHECK32-NEXT: mov.b32 %r2, gep_bitcast_param_1;
+; CHECK32-NEXT: mov.u32 %r3, %r2;
+; CHECK32-NEXT: cvta.to.global.u32 %r4, %r1;
+; CHECK32-NEXT: ld.param.u32 %r5, [gep_bitcast_param_2];
+; CHECK32-NEXT: shl.b32 %r6, %r5, 2;
+; CHECK32-NEXT: add.s32 %r7, %r3, %r6;
+; CHECK32-NEXT: ld.param.u8 %rs1, [%r7];
+; CHECK32-NEXT: st.global.u8 [%r4], %rs1;
+; CHECK32-NEXT: ret;
+;
+; CHECK64-LABEL: gep_bitcast(
+; CHECK64: {
+; CHECK64-NEXT: .reg .b16 %rs<2>;
+; CHECK64-NEXT: .reg .b32 %r<2>;
+; CHECK64-NEXT: .reg .b64 %rd<7>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT: // %bb.0: // %bb
+; CHECK64-NEXT: ld.param.u64 %rd1, [gep_bitcast_param_0];
+; CHECK64-NEXT: mov.b64 %rd2, gep_bitcast_param_1;
+; CHECK64-NEXT: mov.u64 %rd3, %rd2;
+; CHECK64-NEXT: cvta.to.global.u64 %rd4, %rd1;
+; CHECK64-NEXT: ld.param.u32 %r1, [gep_bitcast_param_2];
+; CHECK64-NEXT: mul.wide.s32 %rd5, %r1, 4;
+; CHECK64-NEXT: add.s64 %rd6, %rd3, %rd5;
+; CHECK64-NEXT: ld.param.u8 %rs1, [%rd6];
+; CHECK64-NEXT: st.global.u8 [%rd4], %rs1;
+; CHECK64-NEXT: ret;
bb:
%n64 = sext i32 %n to i64
%gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
@@ -106,6 +211,40 @@ bb:
;
; Function Attrs: nofree norecurse nounwind willreturn mustprogress
define dso_local void @gep_bitcast_asc(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
+; CHECK32-LABEL: gep_bitcast_asc(
+; CHECK32: {
+; CHECK32-NEXT: .reg .b16 %rs<2>;
+; CHECK32-NEXT: .reg .b32 %r<8>;
+; CHECK32-EMPTY:
+; CHECK32-NEXT: // %bb.0: // %bb
+; CHECK32-NEXT: ld.param.u32 %r1, [gep_bitcast_asc_param_0];
+; CHECK32-NEXT: mov.b32 %r2, gep_bitcast_asc_param_1;
+; CHECK32-NEXT: mov.u32 %r3, %r2;
+; CHECK32-NEXT: cvta.to.global.u32 %r4, %r1;
+; CHECK32-NEXT: ld.param.u32 %r5, [gep_bitcast_asc_param_2];
+; CHECK32-NEXT: shl.b32 %r6, %r5, 2;
+; CHECK32-NEXT: add.s32 %r7, %r3, %r6;
+; CHECK32-NEXT: ld.param.u8 %rs1, [%r7];
+; CHECK32-NEXT: st.global.u8 [%r4], %rs1;
+; CHECK32-NEXT: ret;
+;
+; CHECK64-LABEL: gep_bitcast_asc(
+; CHECK64: {
+; CHECK64-NEXT: .reg .b16 %rs<2>;
+; CHECK64-NEXT: .reg .b32 %r<2>;
+; CHECK64-NEXT: .reg .b64 %rd<7>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT: // %bb.0: // %bb
+; CHECK64-NEXT: ld.param.u64 %rd1, [gep_bitcast_asc_param_0];
+; CHECK64-NEXT: mov.b64 %rd2, gep_bitcast_asc_param_1;
+; CHECK64-NEXT: mov.u64 %rd3, %rd2;
+; CHECK64-NEXT: cvta.to.global.u64 %rd4, %rd1;
+; CHECK64-NEXT: ld.param.u32 %r1, [gep_bitcast_asc_param_2];
+; CHECK64-NEXT: mul.wide.s32 %rd5, %r1, 4;
+; CHECK64-NEXT: add.s64 %rd6, %rd3, %rd5;
+; CHECK64-NEXT: ld.param.u8 %rs1, [%rd6];
+; CHECK64-NEXT: st.global.u8 [%rd4], %rs1;
+; CHECK64-NEXT: ret;
bb:
%n64 = sext i32 %n to i64
%gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
@@ -140,6 +279,84 @@ bb:
; Function Attrs: convergent norecurse nounwind mustprogress
define dso_local void @pointer_escapes(ptr nocapture %arg, ptr byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
+; CHECK32-LABEL: pointer_escapes(
+; CHECK32: {
+; CHECK32-NEXT: .local .align 4 .b8 __local_depot4[16];
+; CHECK32-NEXT: .reg .b32 %SP;
+; CHECK32-NEXT: .reg .b32 %SPL;
+; CHECK32-NEXT: .reg .b32 %r<16>;
+; CHECK32-EMPTY:
+; CHECK32-NEXT: // %bb.0: // %bb
+; CHECK32-NEXT: mov.u32 %SPL, __local_depot4;
+; CHECK32-NEXT: ld.param.u32 %r1, [pointer_escapes_param_0];
+; CHECK32-NEXT: add.u32 %r3, %SPL, 0;
+; CHECK32-NEXT: ld.param.u32 %r4, [pointer_escapes_param_2];
+; CHECK32-NEXT: ld.param.u32 %r5, [pointer_escapes_param_1+12];
+; CHECK32-NEXT: ld.param.u32 %r6, [pointer_escapes_param_1+8];
+; CHECK32-NEXT: ld.param.u32 %r7, [pointer_escapes_param_1+4];
+; CHECK32-NEXT: ld.param.u32 %r8, [pointer_escapes_param_1];
+; CHECK32-NEXT: st.local.u32 [%r3], %r8;
+; CHECK32-NEXT: st.local.u32 [%r3+4], %r7;
+; CHECK32-NEXT: st.local.u32 [%r3+8], %r6;
+; CHECK32-NEXT: st.local.u32 [%r3+12], %r5;
+; CHECK32-NEXT: cvta.to.global.u32 %r9, %r1;
+; CHECK32-NEXT: shl.b32 %r10, %r4, 2;
+; CHECK32-NEXT: add.s32 %r11, %r3, %r10;
+; CHECK32-NEXT: cvta.local.u32 %r12, %r11;
+; CHECK32-NEXT: ld.local.u32 %r13, [%r11];
+; CHECK32-NEXT: st.global.u32 [%r9], %r13;
+; CHECK32-NEXT: { // callseq 0, 0
+; CHECK32-NEXT: .param .b32 param0;
+; CHECK32-NEXT: st.param.b32 [param0+0], %r12;
+; CHECK32-NEXT: .param .b32 retval0;
+; CHECK32-NEXT: call.uni (retval0),
+; CHECK32-NEXT: escape,
+; CHECK32-NEXT: (
+; CHECK32-NEXT: param0
+; CHECK32-NEXT: );
+; CHECK32-NEXT: ld.param.b32 %r14, [retval0+0];
+; CHECK32-NEXT: } // callseq 0
+; CHECK32-NEXT: ret;
+;
+; CHECK64-LABEL: pointer_escapes(
+; CHECK64: {
+; CHECK64-NEXT: .local .align 4 .b8 __local_depot4[16];
+; CHECK64-NEXT: .reg .b64 %SP;
+; CHECK64-NEXT: .reg .b64 %SPL;
+; CHECK64-NEXT: .reg .b32 %r<7>;
+; CHECK64-NEXT: .reg .b64 %rd<10>;
+; CHECK64-EMPTY:
+; CHECK64-NEXT: // %bb.0: // %bb
+; CHECK64-NEXT: mov.u64 %SPL, __local_depot4;
+; CHECK64-NEXT: ld.param.u64 %rd1, [pointer_escapes_param_0];
+; CHECK64-NEXT: add.u64 %rd3, %SPL, 0;
+; CHECK64-NEXT: ld.param.u32 %r1, [pointer_escapes_param_2];
+; CHECK64-NEXT: ld.param.u32 %r2, [pointer_escapes_param_1+12];
+; CHECK64-NEXT: ld.param.u32 %r3, [pointer_escapes_param_1+8];
+; CHECK64-NEXT: ld.param.u32 %r4, [pointer_escapes_param_1+4];
+; CHECK64-NEXT: ld.param.u32 %r5, [pointer_escapes_param_1];
+; CHECK64-NEXT: st.local.u32 [%rd3], %r5;
+; CHECK64-NEXT: st.local.u32 [%rd3+4], %r4;
+; CHECK64-NEXT: st.local.u32 [%rd3+8], %r3;
+; CHECK64-NEXT: st.local.u32 [%rd3+12], %r2;
+; CHECK64-NEXT: cvta.to.global.u64 %rd4, %rd1;
+; CHECK64-NEXT: mul.wide.s32 %rd5, %r1, 4;
+; CHECK64-NEXT: add.s64 %rd6, %rd3, %rd5;
+; CHECK64-NEXT: cvta.local.u64 %rd7, %rd6;
+; CHECK64-NEXT: ld.local.u32 %r6, [%rd6];
+; CHECK64-NEXT: st.global.u32 [%rd4], %r6;
+; CHECK64-NEXT: { // callseq 0, 0
+; CHECK64-NEXT: .param .b64 param0;
+; CHECK64-NEXT: st.param.b64 [param0+0], %rd7;
+; CHECK64-NEXT: .param .b64 retval0;
+; CHECK64-NEXT: call.uni (retval0),
+; CHECK64-NEXT: escape,
+; CHECK64-NEXT: (
+; CHECK64-NEXT: param0
+; CHECK64-NEXT: );
+; CHECK64-NEXT: ld.param.b64 %rd8, [retval0+0];
+; CHECK64-NEXT: } // callseq 0
+; CHECK64-NEXT: ret;
bb:
%tmp = sext i32 %arg2 to i64
%tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
@@ -164,3 +381,5 @@ declare dso_local ptr @escape(ptr) local_unnamed_addr
!5 = !{ptr @pointer_escapes, !"kernel", i32 1}
!6 = !{ptr @gep_bitcast, !"kernel", i32 1}
!7 = !{ptr @gep_bitcast_asc, !"kernel", i32 1}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/mulwide.ll b/llvm/test/CodeGen/NVPTX/mulwide.ll
index 77c21564c8aa76..9e311c6833d5eb 100644
--- a/llvm/test/CodeGen/NVPTX/mulwide.ll
+++ b/llvm/test/CodeGen/NVPTX/mulwide.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -O3 | FileCheck %s --check-prefix=OPT
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -O0 | FileCheck %s --check-prefix=NOOPT
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -O3 | %ptxas-verify %}
@@ -6,8 +7,6 @@
; OPT-LABEL: @mulwide16
; NOOPT-LABEL: @mulwide16
define i32 @mulwide16(i16 %a, i16 %b) {
-; OPT: mul.wide.s16
-; NOOPT: mul.lo.s32
%val0 = sext i16 %a to i32
%val1 = sext i16 %b to i32
%val2 = mul i32 %val0, %val1
@@ -17,8 +16,6 @@ define i32 @mulwide16(i16 %a, i16 %b) {
; OPT-LABEL: @mulwideu16
; NOOPT-LABEL: @mulwideu16
define i32 @mulwideu16(i16 %a, i16 %b) {
-; OPT: mul.wide.u16
-; NOOPT: mul.lo.s32
%val0 = zext i16 %a to i32
%val1 = zext i16 %b to i32
%val2 = mul i32 %val0, %val1
@@ -28,8 +25,6 @@ define i32 @mulwideu16(i16 %a, i16 %b) {
; OPT-LABEL: @mulwide8
; NOOPT-LABEL: @mulwide8
define i32 @mulwide8(i8 %a, i8 %b) {
-; OPT: mul.wide.s16
-; NOOPT: mul.lo.s32
%val0 = sext i8 %a to i32
%val1 = sext i8 %b to i32
%val2 = mul i32 %val0, %val1
@@ -39,8 +34,6 @@ define i32 @mulwide8(i8 %a, i8 %b) {
; OPT-LABEL: @mulwideu8
; NOOPT-LABEL: @mulwideu8
define i32 @mulwideu8(i8 %a, i8 %b) {
-; OPT: mul.wide.u16
-; NOOPT: mul.lo.s32
%val0 = zext i8 %a to i32
%val1 = zext i8 %b to i32
%val2 = mul i32 %val0, %val1
@@ -50,8 +43,6 @@ define i32 @mulwideu8(i8 %a, i8 %b) {
; OPT-LABEL: @mulwide32
; NOOPT-LABEL: @mulwide32
define i64 @mulwide32(i32 %a, i32 %b) {
-; OPT: mul.wide.s32
-; NOOPT: mul.lo.s64
%val0 = sext i32 %a to i64
%val1 = sext i32 %b to i64
%val2 = mul i64 %val0, %val1
@@ -61,8 +52,6 @@ define i64 @mulwide32(i32 %a, i32 %b) {
; OPT-LABEL: @mulwideu32
; NOOPT-LABEL: @mulwideu32
define i64 @mulwideu32(i32 %a, i32 %b) {
-; OPT: mul.wide.u32
-; NOOPT: mul.lo.s64
%val0 = zext i32 %a to i64
%val1 = zext i32 %b to i64
%val2 = mul i64 %val0, %val1
@@ -72,8 +61,6 @@ define i64 @mulwideu32(i32 %a, i32 %b) {
; OPT-LABEL: @mulwideu7
; NOOPT-LABEL: @mulwideu7
define i64 @mulwideu7(i7 %a, i7 %b) {
-; OPT: mul.wide.u32
-; NOOPT: mul.lo.s64
%val0 = zext i7 %a to i64
%val1 = zext i7 %b to i64
%val2 = mul i64 %val0, %val1
@@ -83,8 +70,6 @@ define i64 @mulwideu7(i7 %a, i7 %b) {
; OPT-LABEL: @mulwides7
; NOOPT-LABEL: @mulwides7
define i64 @mulwides7(i7 %a, i7 %b) {
-; OPT: mul.wide.s32
-; NOOPT: mul.lo.s64
%val0 = sext i7 %a to i64
%val1 = sext i7 %b to i64
%val2 = mul i64 %val0, %val1
@@ -94,8 +79,6 @@ define i64 @mulwides7(i7 %a, i7 %b) {
; OPT-LABEL: @shl30
; NOOPT-LABEL: @shl30
define i64 @shl30(i32 %a) {
-; OPT: mul.wide
-; NOOPT: shl.b64
%conv = sext i32 %a to i64
%shl = shl i64 %conv, 30
ret i64 %shl
@@ -104,9 +87,10 @@ define i64 @shl30(i32 %a) {
; OPT-LABEL: @shl31
; NOOPT-LABEL: @shl31
define i64 @shl31(i32 %a) {
-; OPT-NOT: mul.wide
-; NOOPT-NOT: mul.wide
%conv = sext i32 %a to i64
%shl = shl i64 %conv, 31
ret i64 %shl
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; NOOPT: {{.*}}
+; OPT: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
index 40a3e9e945a23e..34461914915966 100644
--- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Verifies correctness of load/store of parameters and return values.
; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify %}
@@ -383,3 +384,5 @@ define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
%r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a)
ret %s_i8f64p %r
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/PowerPC/coalesce-ext.ll b/llvm/test/CodeGen/PowerPC/coalesce-ext.ll
index 67de45f453d5de..bd726d330dbb7b 100644
--- a/llvm/test/CodeGen/PowerPC/coalesce-ext.ll
+++ b/llvm/test/CodeGen/PowerPC/coalesce-ext.ll
@@ -1,18 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -verify-machineinstrs -mcpu=g5 -mtriple=powerpc64-unknown-linux-gnu -ppc-asm-full-reg-names < %s | FileCheck %s
; Check that the peephole optimizer knows about sext and zext instructions.
; CHECK: test1sext
define i32 @test1sext(i64 %A, i64 %B, ptr %P, ptr %P2) nounwind {
+; CHECK-LABEL: test1sext:
+; CHECK: # %bb.0:
+; CHECK-NEXT: add r4, r3, r4
+; CHECK-NEXT: extsw r3, r4
+; CHECK-NEXT: std r3, 0(r6)
+; CHECK-NEXT: add r3, r4, r4
+; CHECK-NEXT: stw r4, 0(r5)
+; CHECK-NEXT: blr
%C = add i64 %A, %B
- ; CHECK: add [[SUM:r[0-9]+]], r3, r4
%D = trunc i64 %C to i32
%E = shl i64 %C, 32
%F = ashr i64 %E, 32
- ; CHECK: extsw [[EXT:r[0-9]+]], [[SUM]]
store volatile i64 %F, ptr %P2
- ; CHECK-DAG: std [[EXT]]
store volatile i32 %D, ptr %P
; Reuse low bits of extended register, don't extend live range of SUM.
- ; CHECK-DAG: stw [[SUM]]
%R = add i32 %D, %D
ret i32 %R
}
diff --git a/llvm/test/CodeGen/PowerPC/extsh.ll b/llvm/test/CodeGen/PowerPC/extsh.ll
index f4c83ec9d0cf3e..c057d3f119ddfd 100644
--- a/llvm/test/CodeGen/PowerPC/extsh.ll
+++ b/llvm/test/CodeGen/PowerPC/extsh.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; This should turn into a single extsh
; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- | grep extsh | count 1
define i32 @test(i32 %X) {
diff --git a/llvm/test/CodeGen/PowerPC/shl_sext.ll b/llvm/test/CodeGen/PowerPC/shl_sext.ll
index cf83944fef8d65..53da81f45ee11d 100644
--- a/llvm/test/CodeGen/PowerPC/shl_sext.ll
+++ b/llvm/test/CodeGen/PowerPC/shl_sext.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; This test should not contain a sign extend
; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- | not grep extsb
diff --git a/llvm/test/CodeGen/SystemZ/int-abs-01.ll b/llvm/test/CodeGen/SystemZ/int-abs-01.ll
index 053c347c0b7560..7bdf622ed67d1a 100644
--- a/llvm/test/CodeGen/SystemZ/int-abs-01.ll
+++ b/llvm/test/CodeGen/SystemZ/int-abs-01.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Test integer absolute.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
@@ -5,8 +6,9 @@
; Test i32->i32 absolute using slt.
define i32 @f1(i32 %val) {
; CHECK-LABEL: f1:
-; CHECK: lpr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp slt i32 %val, 0
%neg = sub i32 0, %val
%res = select i1 %cmp, i32 %neg, i32 %val
@@ -16,8 +18,9 @@ define i32 @f1(i32 %val) {
; Test i32->i32 absolute using sle.
define i32 @f2(i32 %val) {
; CHECK-LABEL: f2:
-; CHECK: lpr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp sle i32 %val, 0
%neg = sub i32 0, %val
%res = select i1 %cmp, i32 %neg, i32 %val
@@ -27,8 +30,9 @@ define i32 @f2(i32 %val) {
; Test i32->i32 absolute using sgt.
define i32 @f3(i32 %val) {
; CHECK-LABEL: f3:
-; CHECK: lpr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp sgt i32 %val, 0
%neg = sub i32 0, %val
%res = select i1 %cmp, i32 %val, i32 %neg
@@ -38,8 +42,9 @@ define i32 @f3(i32 %val) {
; Test i32->i32 absolute using sge.
define i32 @f4(i32 %val) {
; CHECK-LABEL: f4:
-; CHECK: lpr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp sge i32 %val, 0
%neg = sub i32 0, %val
%res = select i1 %cmp, i32 %val, i32 %neg
@@ -49,8 +54,9 @@ define i32 @f4(i32 %val) {
; Test i32->i64 absolute.
define i64 @f5(i32 %val) {
; CHECK-LABEL: f5:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%ext = sext i32 %val to i64
%cmp = icmp slt i64 %ext, 0
%neg = sub i64 0, %ext
@@ -61,8 +67,9 @@ define i64 @f5(i32 %val) {
; Test i32->i64 absolute that uses an "in-register" form of sign extension.
define i64 @f6(i64 %val) {
; CHECK-LABEL: f6:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%trunc = trunc i64 %val to i32
%ext = sext i32 %trunc to i64
%cmp = icmp slt i64 %ext, 0
@@ -74,8 +81,9 @@ define i64 @f6(i64 %val) {
; Test i64 absolute.
define i64 @f7(i64 %val) {
; CHECK-LABEL: f7:
-; CHECK: lpgr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp slt i64 %val, 0
%neg = sub i64 0, %val
%res = select i1 %cmp, i64 %neg, i64 %val
@@ -85,8 +93,9 @@ define i64 @f7(i64 %val) {
; Test another form of f6, which is that produced by InstCombine.
define i64 @f8(i64 %val) {
; CHECK-LABEL: f8:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -98,8 +107,9 @@ define i64 @f8(i64 %val) {
; Try again with sle rather than slt.
define i64 @f9(i64 %val) {
; CHECK-LABEL: f9:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -111,8 +121,9 @@ define i64 @f9(i64 %val) {
; Repeat f8 with the operands reversed.
define i64 @f10(i64 %val) {
; CHECK-LABEL: f10:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -124,8 +135,9 @@ define i64 @f10(i64 %val) {
; Try again with sge rather than sgt.
define i64 @f11(i64 %val) {
; CHECK-LABEL: f11:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -137,8 +149,9 @@ define i64 @f11(i64 %val) {
; Repeat f5 with the comparison on the unextended value.
define i64 @f12(i32 %val) {
; CHECK-LABEL: f12:
-; CHECK: lpgfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lpgfr %r2, %r2
+; CHECK-NEXT: br %r14
%ext = sext i32 %val to i64
%cmp = icmp slt i32 %val, 0
%neg = sub i64 0, %ext
diff --git a/llvm/test/CodeGen/SystemZ/int-cmp-44.ll b/llvm/test/CodeGen/SystemZ/int-cmp-44.ll
index 41ace057706c3c..559dcfe25d237f 100644
--- a/llvm/test/CodeGen/SystemZ/int-cmp-44.ll
+++ b/llvm/test/CodeGen/SystemZ/int-cmp-44.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Test that compares are omitted if CC already has the right value
; (z10 version).
;
@@ -11,9 +12,12 @@ declare void @foo()
; First test the EQ case.
define i32 @f1(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f1:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: ber %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: ber %r14
+; CHECK-NEXT: .LBB0_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
%cmp = icmp eq i32 %res, 0
@@ -30,9 +34,12 @@ exit:
; ...and again with NE.
define i32 @f2(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f2:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: blhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: blhr %r14
+; CHECK-NEXT: .LBB1_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
%cmp = icmp ne i32 %res, 0
@@ -49,8 +56,12 @@ exit:
; ...and again with SLT.
define i32 @f3(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f3:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: blr %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB2_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
%cmp = icmp slt i32 %res, 0
@@ -67,8 +78,12 @@ exit:
; ...and again with SLE.
define i32 @f4(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f4:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: bler %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: bler %r14
+; CHECK-NEXT: .LBB3_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
%cmp = icmp sle i32 %res, 0
@@ -85,8 +100,12 @@ exit:
; ...and again with SGT.
define i32 @f5(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f5:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: bhr %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB4_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
%cmp = icmp sgt i32 %res, 0
@@ -103,8 +122,12 @@ exit:
; ...and again with SGE.
define i32 @f6(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f6:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: bher %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB5_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
%cmp = icmp sge i32 %res, 0
@@ -122,9 +145,12 @@ exit:
; zero even without "nsw".
define i32 @f7(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f7:
-; CHECK: s %r2, 0(%r4)
-; CHECK-NEXT: bner %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: s %r2, 0(%r4)
+; CHECK-NEXT: bner %r14
+; CHECK-NEXT: .LBB6_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%cur = load i32, ptr %dest
%res = sub i32 %a, %cur
@@ -142,8 +168,12 @@ exit:
; ...and again with SLT.
define i32 @f8(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f8:
-; CHECK: s %r2, 0(%r4)
-; CHECK-NEXT: blr %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: s %r2, 0(%r4)
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB7_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%cur = load i32, ptr %dest
%res = sub nsw i32 %a, %cur
@@ -162,9 +192,12 @@ exit:
; comparisons with zero.
define i32 @f9(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f9:
-; CHECK: nr %r2, %r3
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: nr %r2, %r3
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB8_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = and i32 %a, %b
%cmp = icmp ne i32 %res, 0
@@ -181,9 +214,12 @@ exit:
; ...but not for ordered comparisons.
define i32 @f10(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f10:
-; CHECK: nr %r2, %r3
-; CHECK-NEXT: cibl %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: nr %r2, %r3
+; CHECK-NEXT: cibl %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB9_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = and i32 %a, %b
%cmp = icmp slt i32 %res, 0
@@ -201,9 +237,12 @@ exit:
; comparisons with zero if the immediate covers the whole register.
define i32 @f11(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f11:
-; CHECK: nilf %r2, 100000001
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: nilf %r2, 100000001
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB10_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = and i32 %a, 100000001
%cmp = icmp ne i32 %res, 0
@@ -221,9 +260,12 @@ exit:
; zero results.
define i32 @f12(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f12:
-; CHECK: nill %r2, 65436
-; CHECK-NEXT: ciblh %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: nill %r2, 65436
+; CHECK-NEXT: ciblh %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB11_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = and i32 %a, -100
%cmp = icmp ne i32 %res, 0
@@ -240,9 +282,12 @@ exit:
; SRA provides the same CC result as a comparison with zero.
define i32 @f13(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f13:
-; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: ber %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sra %r2, 0(%r3)
+; CHECK-NEXT: ber %r14
+; CHECK-NEXT: .LBB12_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = ashr i32 %a, %b
%cmp = icmp eq i32 %res, 0
@@ -259,9 +304,12 @@ exit:
; ...and again with NE.
define i32 @f14(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f14:
-; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: blhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sra %r2, 0(%r3)
+; CHECK-NEXT: blhr %r14
+; CHECK-NEXT: .LBB13_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = ashr i32 %a, %b
%cmp = icmp ne i32 %res, 0
@@ -278,9 +326,12 @@ exit:
; ...and SLT.
define i32 @f15(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f15:
-; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sra %r2, 0(%r3)
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB14_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = ashr i32 %a, %b
%cmp = icmp slt i32 %res, 0
@@ -297,9 +348,12 @@ exit:
; ...and SLE.
define i32 @f16(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f16:
-; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: bler %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sra %r2, 0(%r3)
+; CHECK-NEXT: bler %r14
+; CHECK-NEXT: .LBB15_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = ashr i32 %a, %b
%cmp = icmp sle i32 %res, 0
@@ -316,9 +370,12 @@ exit:
; ...and SGT.
define i32 @f17(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f17:
-; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sra %r2, 0(%r3)
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB16_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = ashr i32 %a, %b
%cmp = icmp sgt i32 %res, 0
@@ -335,9 +392,12 @@ exit:
; ...and SGE.
define i32 @f18(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f18:
-; CHECK: sra %r2, 0(%r3)
-; CHECK-NEXT: bher %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sra %r2, 0(%r3)
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB17_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = ashr i32 %a, %b
%cmp = icmp sge i32 %res, 0
@@ -355,9 +415,12 @@ exit:
; Test the EQ case.
define i64 @f19(i64 %a, i64 %b, ptr %dest) {
; CHECK-LABEL: f19:
-; CHECK: risbg %r2, %r3, 0, 190, 0
-; CHECK-NEXT: ber %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: risbg %r2, %r3, 0, 190, 0
+; CHECK-NEXT: ber %r14
+; CHECK-NEXT: .LBB18_1: # %store
+; CHECK-NEXT: stg %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = and i64 %b, -2
%cmp = icmp eq i64 %res, 0
@@ -374,9 +437,12 @@ exit:
; ...and the SLT case.
define i64 @f20(i64 %a, i64 %b, ptr %dest) {
; CHECK-LABEL: f20:
-; CHECK: risbg %r2, %r3, 0, 190, 0
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: risbg %r2, %r3, 0, 190, 0
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB19_1: # %store
+; CHECK-NEXT: stg %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%res = and i64 %b, -2
%cmp = icmp slt i64 %res, 0
@@ -394,12 +460,15 @@ exit:
; instruction.
define i32 @f21(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f21:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: cibe %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: cibe %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB20_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add = add i32 %a, 1000000
%res = call i32 asm "blah $0", "=r,0" (i32 %add)
@@ -417,12 +486,15 @@ exit:
; ...and again with a CC-clobbering instruction.
define i32 @f22(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f22:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: cibe %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: cibe %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB21_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add = add i32 %a, 1000000
%res = call i32 asm "blah $0", "=r,0,~{cc}" (i32 %add)
@@ -440,10 +512,13 @@ exit:
; Check that stores do not interfere.
define i32 @f23(i32 %a, i32 %b, ptr %dest1, ptr %dest2) {
; CHECK-LABEL: f23:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: st %r2, 0(%r4)
-; CHECK-NEXT: blhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: st %r2, 0(%r4)
+; CHECK-NEXT: blhr %r14
+; CHECK-NEXT: .LBB22_1: # %store
+; CHECK-NEXT: st %r3, 0(%r5)
+; CHECK-NEXT: br %r14
entry:
%res = add nsw i32 %a, 1000000
store i32 %res, ptr %dest1
@@ -461,10 +536,25 @@ exit:
; Check that calls do interfere.
define void @f24(ptr %ptr) {
; CHECK-LABEL: f24:
-; CHECK: afi [[REG:%r[0-9]+]], 1000000
-; CHECK-NEXT: brasl %r14, foo at PLT
-; CHECK-NEXT: cijlh [[REG]], 0, .L{{.*}}
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: stmg %r12, %r15, 96(%r15)
+; CHECK-NEXT: .cfi_offset %r12, -64
+; CHECK-NEXT: .cfi_offset %r13, -56
+; CHECK-NEXT: .cfi_offset %r14, -48
+; CHECK-NEXT: .cfi_offset %r15, -40
+; CHECK-NEXT: aghi %r15, -160
+; CHECK-NEXT: .cfi_def_cfa_offset 320
+; CHECK-NEXT: lgr %r13, %r2
+; CHECK-NEXT: lhi %r12, 1
+; CHECK-NEXT: x %r12, 0(%r2)
+; CHECK-NEXT: afi %r12, 1000000
+; CHECK-NEXT: brasl %r14, foo at PLT
+; CHECK-NEXT: cijlh %r12, 0, .LBB23_2
+; CHECK-NEXT: # %bb.1: # %store
+; CHECK-NEXT: st %r12, 0(%r13)
+; CHECK-NEXT: .LBB23_2: # %exit
+; CHECK-NEXT: lmg %r12, %r15, 256(%r15)
+; CHECK-NEXT: br %r14
entry:
%val = load i32, ptr %ptr
%xor = xor i32 %val, 1
@@ -484,12 +574,15 @@ exit:
; Check that inline asms don't interfere if they don't clobber CC.
define void @f25(i32 %a, ptr %ptr) {
; CHECK-LABEL: f25:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: blhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blhr %r14
+; CHECK-NEXT: .LBB24_1: # %store
+; CHECK-NEXT: st %r2, 0(%r3)
+; CHECK-NEXT: br %r14
entry:
%add = add nsw i32 %a, 1000000
call void asm sideeffect "blah", "r"(i32 %add)
@@ -507,12 +600,15 @@ exit:
; ...but do interfere if they do clobber CC.
define void @f26(i32 %a, ptr %ptr) {
; CHECK-LABEL: f26:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: ciblh %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: ciblh %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB25_1: # %store
+; CHECK-NEXT: st %r2, 0(%r3)
+; CHECK-NEXT: br %r14
entry:
%add = add i32 %a, 1000000
call void asm sideeffect "blah", "r,~{cc}"(i32 %add)
@@ -531,11 +627,14 @@ exit:
; compare input.
define i32 @f27(i32 %a, i32 %b, ptr %dest1, ptr %dest2) {
; CHECK-LABEL: f27:
-; CHECK: afi %r2, 1000000
-; CHECK-NEXT: sr %r3, %r2
-; CHECK-NEXT: st %r3, 0(%r4)
-; CHECK-NEXT: cibe %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: afi %r2, 1000000
+; CHECK-NEXT: sr %r3, %r2
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: cibe %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB26_1: # %store
+; CHECK-NEXT: st %r3, 0(%r5)
+; CHECK-NEXT: br %r14
entry:
%add = add nsw i32 %a, 1000000
%sub = sub i32 %b, %add
@@ -554,9 +653,12 @@ exit:
; Make sure that we don't confuse a base register for a destination.
define void @f28(i64 %a, ptr %dest) {
; CHECK-LABEL: f28:
-; CHECK: xi 0(%r2), 15
-; CHECK: cgibe %r2, 0, 0(%r14)
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xi 0(%r2), 15
+; CHECK-NEXT: cgibe %r2, 0, 0(%r14)
+; CHECK-NEXT: .LBB27_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r3)
+; CHECK-NEXT: br %r14
entry:
%ptr = inttoptr i64 %a to ptr
%val = load i8, ptr %ptr
@@ -576,9 +678,12 @@ exit:
; Test that L gets converted to LT where useful.
define i32 @f29(i64 %base, i64 %index, ptr %dest) {
; CHECK-LABEL: f29:
-; CHECK: lt %r2, 0({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: bler %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lt %r2, 0(%r3,%r2)
+; CHECK-NEXT: bler %r14
+; CHECK-NEXT: .LBB28_1: # %store
+; CHECK-NEXT: st %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add = add i64 %base, %index
%ptr = inttoptr i64 %add to ptr
@@ -597,9 +702,12 @@ exit:
; Test that LY gets converted to LT where useful.
define i32 @f30(i64 %base, i64 %index, ptr %dest) {
; CHECK-LABEL: f30:
-; CHECK: lt %r2, 100000({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: bler %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lt %r2, 100000(%r3,%r2)
+; CHECK-NEXT: bler %r14
+; CHECK-NEXT: .LBB29_1: # %store
+; CHECK-NEXT: st %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 100000
@@ -619,9 +727,12 @@ exit:
; Test that LG gets converted to LTG where useful.
define i64 @f31(i64 %base, i64 %index, ptr %dest) {
; CHECK-LABEL: f31:
-; CHECK: ltg %r2, 0({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: bher %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltg %r2, 0(%r3,%r2)
+; CHECK-NEXT: bher %r14
+; CHECK-NEXT: .LBB30_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add = add i64 %base, %index
%ptr = inttoptr i64 %add to ptr
@@ -640,9 +751,12 @@ exit:
; Test that LGF gets converted to LTGF where useful.
define i64 @f32(i64 %base, i64 %index, ptr %dest) {
; CHECK-LABEL: f32:
-; CHECK: ltgf %r2, 0({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgf %r2, 0(%r3,%r2)
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB31_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add = add i64 %base, %index
%ptr = inttoptr i64 %add to ptr
@@ -662,12 +776,15 @@ exit:
; Test that LR gets converted to LTR where useful.
define i32 @f33(i32 %dummy, i32 %val, ptr %dest) {
; CHECK-LABEL: f33:
-; CHECK: ltr %r2, %r3
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltr %r2, %r3
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB32_1: # %store
+; CHECK-NEXT: st %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
call void asm sideeffect "blah $0", "{r2}"(i32 %val)
%cmp = icmp slt i32 %val, 0
@@ -684,12 +801,15 @@ exit:
; Test that LGR gets converted to LTGR where useful.
define i64 @f34(i64 %dummy, i64 %val, ptr %dest) {
; CHECK-LABEL: f34:
-; CHECK: ltgr %r2, %r3
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgr %r2, %r3
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB33_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
call void asm sideeffect "blah $0", "{r2}"(i64 %val)
%cmp = icmp sgt i64 %val, 0
@@ -706,12 +826,15 @@ exit:
; Test that LGFR gets converted to LTGFR where useful.
define i64 @f35(i64 %dummy, i32 %val, ptr %dest) {
; CHECK-LABEL: f35:
-; CHECK: ltgfr %r2, %r3
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgfr %r2, %r3
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB34_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%ext = sext i32 %val to i64
call void asm sideeffect "blah $0", "{r2}"(i64 %ext)
@@ -730,12 +853,15 @@ exit:
; we need.
define i32 @f36(i32 %val, i32 %dummy, ptr %dest) {
; CHECK-LABEL: f36:
-; CHECK: ltr %r3, %r2
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r3
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltr %r3, %r2
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r3
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB35_1: # %store
+; CHECK-NEXT: st %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
call void asm sideeffect "blah $0", "{r3}"(i32 %val)
%cmp = icmp slt i32 %val, 0
@@ -753,12 +879,15 @@ exit:
; we need.
define i64 @f37(i64 %val, i64 %dummy, ptr %dest) {
; CHECK-LABEL: f37:
-; CHECK: ltgr %r3, %r2
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r3
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgr %r3, %r2
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r3
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB36_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
call void asm sideeffect "blah $0", "{r3}"(i64 %val)
%cmp = icmp slt i64 %val, 0
@@ -776,12 +905,15 @@ exit:
; we need.
define i32 @f38(i32 %val, i64 %dummy, ptr %dest) {
; CHECK-LABEL: f38:
-; CHECK: ltgfr %r3, %r2
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r3
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgfr %r3, %r2
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r3
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: blr %r14
+; CHECK-NEXT: .LBB37_1: # %store
+; CHECK-NEXT: st %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%ext = sext i32 %val to i64
call void asm sideeffect "blah $0", "{r3}"(i64 %ext)
@@ -799,12 +931,15 @@ exit:
; Test f35 for in-register extensions.
define i64 @f39(i64 %dummy, i64 %a, ptr %dest) {
; CHECK-LABEL: f39:
-; CHECK: ltgfr %r2, %r3
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgfr %r2, %r3
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB38_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%val = trunc i64 %a to i32
%ext = sext i32 %val to i64
@@ -823,12 +958,15 @@ exit:
; ...and again with what InstCombine would produce for f40.
define i64 @f40(i64 %dummy, i64 %a, ptr %dest) {
; CHECK-LABEL: f40:
-; CHECK: ltgfr %r2, %r3
-; CHECK-NEXT: #APP
-; CHECK-NEXT: blah %r2
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgfr %r2, %r3
+; CHECK-NEXT: #APP
+; CHECK-NEXT: blah %r2
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB39_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%shl = shl i64 %a, 32
%ext = ashr i64 %shl, 32
@@ -847,9 +985,12 @@ exit:
; Try a form of f7 in which the subtraction operands are compared directly.
define i32 @f41(i32 %a, i32 %b, ptr %dest) {
; CHECK-LABEL: f41:
-; CHECK: s %r2, 0(%r4)
-; CHECK-NEXT: bner %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: s %r2, 0(%r4)
+; CHECK-NEXT: bner %r14
+; CHECK-NEXT: .LBB40_1: # %store
+; CHECK-NEXT: st %r3, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%cur = load i32, ptr %dest
%res = sub i32 %a, %cur
@@ -867,9 +1008,12 @@ exit:
; A version of f32 that tests the unextended value.
define i64 @f42(i64 %base, i64 %index, ptr %dest) {
; CHECK-LABEL: f42:
-; CHECK: ltgf %r2, 0({{%r2,%r3|%r3,%r2}})
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ltgf %r2, 0(%r3,%r2)
+; CHECK-NEXT: bhr %r14
+; CHECK-NEXT: .LBB41_1: # %store
+; CHECK-NEXT: stg %r2, 0(%r4)
+; CHECK-NEXT: br %r14
entry:
%add = add i64 %base, %index
%ptr = inttoptr i64 %add to ptr
diff --git a/llvm/test/CodeGen/SystemZ/int-mul-10.ll b/llvm/test/CodeGen/SystemZ/int-mul-10.ll
index 539a48622b8baa..3516d645f54707 100644
--- a/llvm/test/CodeGen/SystemZ/int-mul-10.ll
+++ b/llvm/test/CodeGen/SystemZ/int-mul-10.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Test signed high-part i64->i128 multiplications on z14.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 -asm-verbose=0 | FileCheck %s
@@ -6,10 +7,6 @@ declare i64 @foo()
; Check sign-extended multiplication in which only the high part is used.
define i64 @f1(i64 %dummy, i64 %a, i64 %b) {
-; CHECK-LABEL: f1:
-; CHECK-NOT: {{%r[234]}}
-; CHECK: mgrk %r2, %r3, %r4
-; CHECK: br %r14
%ax = sext i64 %a to i128
%bx = sext i64 %b to i128
%mulx = mul i128 %ax, %bx
@@ -21,11 +18,6 @@ define i64 @f1(i64 %dummy, i64 %a, i64 %b) {
; Check sign-extended multiplication in which only part of the high half
; is used.
define i64 @f2(i64 %dummy, i64 %a, i64 %b) {
-; CHECK-LABEL: f2:
-; CHECK-NOT: {{%r[234]}}
-; CHECK: mgrk [[REG:%r[0-9]+]], %r3, %r4
-; CHECK: srlg %r2, [[REG]], 3
-; CHECK: br %r14
%ax = sext i64 %a to i128
%bx = sext i64 %b to i128
%mulx = mul i128 %ax, %bx
@@ -37,11 +29,6 @@ define i64 @f2(i64 %dummy, i64 %a, i64 %b) {
; Check sign-extended multiplication in which the result is split into
; high and low halves.
define i64 @f3(i64 %dummy, i64 %a, i64 %b) {
-; CHECK-LABEL: f3:
-; CHECK-NOT: {{%r[234]}}
-; CHECK: mgrk %r2, %r3, %r4
-; CHECK: ogr %r2, %r3
-; CHECK: br %r14
%ax = sext i64 %a to i128
%bx = sext i64 %b to i128
%mulx = mul i128 %ax, %bx
@@ -54,10 +41,6 @@ define i64 @f3(i64 %dummy, i64 %a, i64 %b) {
; Check MG with no displacement.
define i64 @f4(i64 %dummy, i64 %a, ptr %src) {
-; CHECK-LABEL: f4:
-; CHECK-NOT: {{%r[234]}}
-; CHECK: mg %r2, 0(%r4)
-; CHECK: br %r14
%b = load i64, ptr %src
%ax = sext i64 %a to i128
%bx = sext i64 %b to i128
@@ -69,9 +52,6 @@ define i64 @f4(i64 %dummy, i64 %a, ptr %src) {
; Check the high end of the aligned MG range.
define i64 @f5(i64 %dummy, i64 %a, ptr %src) {
-; CHECK-LABEL: f5:
-; CHECK: mg %r2, 524280(%r4)
-; CHECK: br %r14
%ptr = getelementptr i64, ptr %src, i64 65535
%b = load i64, ptr %ptr
%ax = sext i64 %a to i128
@@ -85,10 +65,6 @@ define i64 @f5(i64 %dummy, i64 %a, ptr %src) {
; Check the next doubleword up, which requires separate address logic.
; Other sequences besides this one would be OK.
define i64 @f6(i64 %dummy, i64 %a, ptr %src) {
-; CHECK-LABEL: f6:
-; CHECK: agfi %r4, 524288
-; CHECK: mg %r2, 0(%r4)
-; CHECK: br %r14
%ptr = getelementptr i64, ptr %src, i64 65536
%b = load i64, ptr %ptr
%ax = sext i64 %a to i128
@@ -101,9 +77,6 @@ define i64 @f6(i64 %dummy, i64 %a, ptr %src) {
; Check the high end of the negative aligned MG range.
define i64 @f7(i64 %dummy, i64 %a, ptr %src) {
-; CHECK-LABEL: f7:
-; CHECK: mg %r2, -8(%r4)
-; CHECK: br %r14
%ptr = getelementptr i64, ptr %src, i64 -1
%b = load i64, ptr %ptr
%ax = sext i64 %a to i128
@@ -116,9 +89,6 @@ define i64 @f7(i64 %dummy, i64 %a, ptr %src) {
; Check the low end of the MG range.
define i64 @f8(i64 %dummy, i64 %a, ptr %src) {
-; CHECK-LABEL: f8:
-; CHECK: mg %r2, -524288(%r4)
-; CHECK: br %r14
%ptr = getelementptr i64, ptr %src, i64 -65536
%b = load i64, ptr %ptr
%ax = sext i64 %a to i128
@@ -132,10 +102,6 @@ define i64 @f8(i64 %dummy, i64 %a, ptr %src) {
; Check the next doubleword down, which needs separate address logic.
; Other sequences besides this one would be OK.
define i64 @f9(ptr %dest, i64 %a, ptr %src) {
-; CHECK-LABEL: f9:
-; CHECK: agfi %r4, -524296
-; CHECK: mg %r2, 0(%r4)
-; CHECK: br %r14
%ptr = getelementptr i64, ptr %src, i64 -65537
%b = load i64, ptr %ptr
%ax = sext i64 %a to i128
@@ -148,9 +114,6 @@ define i64 @f9(ptr %dest, i64 %a, ptr %src) {
; Check that MG allows an index.
define i64 @f10(ptr %dest, i64 %a, i64 %src, i64 %index) {
-; CHECK-LABEL: f10:
-; CHECK: mg %r2, 524287(%r5,%r4)
-; CHECK: br %r14
%add1 = add i64 %src, %index
%add2 = add i64 %add1, 524287
%ptr = inttoptr i64 %add2 to ptr
@@ -163,3 +126,5 @@ define i64 @f10(ptr %dest, i64 %a, i64 %src, i64 %index) {
ret i64 %high
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/SystemZ/int-neg-02.ll b/llvm/test/CodeGen/SystemZ/int-neg-02.ll
index 7f3f6375129aa7..7d62fe743a8b62 100644
--- a/llvm/test/CodeGen/SystemZ/int-neg-02.ll
+++ b/llvm/test/CodeGen/SystemZ/int-neg-02.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Test negative integer absolute.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
@@ -5,8 +6,9 @@
; Test i32->i32 negative absolute using slt.
define i32 @f1(i32 %val) {
; CHECK-LABEL: f1:
-; CHECK: lnr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lnr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp slt i32 %val, 0
%neg = sub i32 0, %val
%abs = select i1 %cmp, i32 %neg, i32 %val
@@ -17,8 +19,9 @@ define i32 @f1(i32 %val) {
; Test i32->i32 negative absolute using sle.
define i32 @f2(i32 %val) {
; CHECK-LABEL: f2:
-; CHECK: lnr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lnr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp sle i32 %val, 0
%neg = sub i32 0, %val
%abs = select i1 %cmp, i32 %neg, i32 %val
@@ -29,8 +32,9 @@ define i32 @f2(i32 %val) {
; Test i32->i32 negative absolute using sgt.
define i32 @f3(i32 %val) {
; CHECK-LABEL: f3:
-; CHECK: lnr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lnr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp sgt i32 %val, 0
%neg = sub i32 0, %val
%abs = select i1 %cmp, i32 %val, i32 %neg
@@ -41,8 +45,9 @@ define i32 @f3(i32 %val) {
; Test i32->i32 negative absolute using sge.
define i32 @f4(i32 %val) {
; CHECK-LABEL: f4:
-; CHECK: lnr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lnr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp sge i32 %val, 0
%neg = sub i32 0, %val
%abs = select i1 %cmp, i32 %val, i32 %neg
@@ -53,8 +58,9 @@ define i32 @f4(i32 %val) {
; Test i32->i64 negative absolute.
define i64 @f5(i32 %val) {
; CHECK-LABEL: f5:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%ext = sext i32 %val to i64
%cmp = icmp slt i64 %ext, 0
%neg = sub i64 0, %ext
@@ -67,8 +73,9 @@ define i64 @f5(i32 %val) {
; sign extension.
define i64 @f6(i64 %val) {
; CHECK-LABEL: f6:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%trunc = trunc i64 %val to i32
%ext = sext i32 %trunc to i64
%cmp = icmp slt i64 %ext, 0
@@ -81,8 +88,9 @@ define i64 @f6(i64 %val) {
; Test i64 negative absolute.
define i64 @f7(i64 %val) {
; CHECK-LABEL: f7:
-; CHECK: lngr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngr %r2, %r2
+; CHECK-NEXT: br %r14
%cmp = icmp slt i64 %val, 0
%neg = sub i64 0, %val
%abs = select i1 %cmp, i64 %neg, i64 %val
@@ -93,8 +101,9 @@ define i64 @f7(i64 %val) {
; Test another form of f6, which is that produced by InstCombine.
define i64 @f8(i64 %val) {
; CHECK-LABEL: f8:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -107,8 +116,9 @@ define i64 @f8(i64 %val) {
; Try again with sle rather than slt.
define i64 @f9(i64 %val) {
; CHECK-LABEL: f9:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -121,8 +131,9 @@ define i64 @f9(i64 %val) {
; Repeat f8 with the operands reversed.
define i64 @f10(i64 %val) {
; CHECK-LABEL: f10:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -135,8 +146,9 @@ define i64 @f10(i64 %val) {
; Try again with sge rather than sgt.
define i64 @f11(i64 %val) {
; CHECK-LABEL: f11:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -149,8 +161,9 @@ define i64 @f11(i64 %val) {
; Repeat f8 with the negation coming from swapped operands.
define i64 @f12(i64 %val) {
; CHECK-LABEL: f12:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -162,8 +175,9 @@ define i64 @f12(i64 %val) {
; Likewise f9.
define i64 @f13(i64 %val) {
; CHECK-LABEL: f13:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -175,8 +189,9 @@ define i64 @f13(i64 %val) {
; Likewise f10.
define i64 @f14(i64 %val) {
; CHECK-LABEL: f14:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -188,8 +203,9 @@ define i64 @f14(i64 %val) {
; Likewise f11.
define i64 @f15(i64 %val) {
; CHECK-LABEL: f15:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%shl = shl i64 %val, 32
%ashr = ashr i64 %shl, 32
%neg = sub i64 0, %ashr
@@ -201,8 +217,9 @@ define i64 @f15(i64 %val) {
; Repeat f5 with the comparison on the unextended value.
define i64 @f16(i32 %val) {
; CHECK-LABEL: f16:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%ext = sext i32 %val to i64
%cmp = icmp slt i32 %val, 0
%neg = sub i64 0, %ext
@@ -214,8 +231,9 @@ define i64 @f16(i32 %val) {
; And again with the negation coming from swapped operands.
define i64 @f17(i32 %val) {
; CHECK-LABEL: f17:
-; CHECK: lngfr %r2, %r2
-; CHECK: br %r14
+; CHECK: # %bb.0:
+; CHECK-NEXT: lngfr %r2, %r2
+; CHECK-NEXT: br %r14
%ext = sext i32 %val to i64
%cmp = icmp slt i32 %val, 0
%neg = sub i64 0, %ext
diff --git a/llvm/test/CodeGen/Thumb2/bfx.ll b/llvm/test/CodeGen/Thumb2/bfx.ll
index 9bd8d70275b924..0191b81805fd1d 100644
--- a/llvm/test/CodeGen/Thumb2/bfx.ll
+++ b/llvm/test/CodeGen/Thumb2/bfx.ll
@@ -1,8 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i32 @sbfx1(i32 %a) {
-; CHECK: sbfx1
-; CHECK: sbfx r0, r0, #7, #11
+; CHECK-LABEL: sbfx1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: sbfx r0, r0, #7, #11
+; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = trunc i32 %t1 to i11
%t3 = sext i11 %t2 to i32
@@ -10,8 +13,10 @@ define i32 @sbfx1(i32 %a) {
}
define i32 @ubfx1(i32 %a) {
-; CHECK: ubfx1
-; CHECK: ubfx r0, r0, #7, #11
+; CHECK-LABEL: ubfx1:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: ubfx r0, r0, #7, #11
+; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = trunc i32 %t1 to i11
%t3 = zext i11 %t2 to i32
@@ -19,8 +24,10 @@ define i32 @ubfx1(i32 %a) {
}
define i32 @ubfx2(i32 %a) {
-; CHECK: ubfx2
-; CHECK: ubfx r0, r0, #7, #11
+; CHECK-LABEL: ubfx2:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: ubfx r0, r0, #7, #11
+; CHECK-NEXT: bx lr
%t1 = lshr i32 %a, 7
%t2 = and i32 %t1, 2047
ret i32 %t2
diff --git a/llvm/test/CodeGen/VE/Scalar/bitreverse.ll b/llvm/test/CodeGen/VE/Scalar/bitreverse.ll
index 208c207ff51392..e95f10e85de452 100644
--- a/llvm/test/CodeGen/VE/Scalar/bitreverse.ll
+++ b/llvm/test/CodeGen/VE/Scalar/bitreverse.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s
declare i128 @llvm.bitreverse.i128(i128)
diff --git a/llvm/test/CodeGen/WebAssembly/conv.ll b/llvm/test/CodeGen/WebAssembly/conv.ll
index cf76548aad17d1..9c7da4b741929a 100644
--- a/llvm/test/CodeGen/WebAssembly/conv.ll
+++ b/llvm/test/CodeGen/WebAssembly/conv.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+nontrapping-fptoint | FileCheck %s
; Test that basic conversion operations assemble as expected.
@@ -354,3 +355,5 @@ define i16 @i16_trunc_sat_u_f64(double %x) {
%a = call i16 @llvm.fptoui.sat.i16.f64(double %x)
ret i16 %a
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-sext-inreg.ll b/llvm/test/CodeGen/WebAssembly/simd-sext-inreg.ll
index 45080d14dfd29b..a5e81064ae66fd 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-sext-inreg.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-sext-inreg.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -mcpu=mvp -mattr=+simd128 | FileCheck %s --check-prefixes CHECK,SIMD128
; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals | FileCheck %s --check-prefixes CHECK,NO-SIMD128
@@ -135,3 +136,7 @@ define i64 @sext_inreg_i32_to_i64(<2 x i64> %x) {
%res = ashr i64 %a, 32
ret i64 %res
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
+; NO-SIMD128: {{.*}}
+; SIMD128: {{.*}}
diff --git a/llvm/test/CodeGen/X86/lvi-hardening-loads.ll b/llvm/test/CodeGen/X86/lvi-hardening-loads.ll
index 4ecb1bc31f2a80..e48618ba7a53d3 100644
--- a/llvm/test/CodeGen/X86/lvi-hardening-loads.ll
+++ b/llvm/test/CodeGen/X86/lvi-hardening-loads.ll
@@ -1,10 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64-ALL
; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown --x86-lvi-load-no-cbranch < %s | FileCheck %s --check-prefix=X64
; RUN: llc -O0 -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --check-prefix=X64-NOOPT
; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @test(ptr %secret, i32 %secret_size) #0 {
-; X64-LABEL: test:
+; X64-ALL-LABEL: test:
+; X64-ALL: # %bb.0: # %entry
+; X64-ALL-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-ALL-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; X64-ALL-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; X64-ALL-NEXT: jmp .LBB0_1
+; X64-ALL-NEXT: .p2align 4, 0x90
+; X64-ALL-NEXT: .LBB0_4: # %if.end
+; X64-ALL-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-ALL-NEXT: incl -{{[0-9]+}}(%rsp)
+; X64-ALL-NEXT: .LBB0_1: # %for.cond
+; X64-ALL-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-ALL-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: jge .LBB0_5
+; X64-ALL-NEXT: # %bb.2: # %for.body
+; X64-ALL-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-ALL-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: movl %eax, %ecx
+; X64-ALL-NEXT: shrl $31, %ecx
+; X64-ALL-NEXT: addl %eax, %ecx
+; X64-ALL-NEXT: andl $-2, %ecx
+; X64-ALL-NEXT: cmpl %ecx, %eax
+; X64-ALL-NEXT: jne .LBB0_4
+; X64-ALL-NEXT: # %bb.3: # %if.then
+; X64-ALL-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-ALL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: movq (%rax,%rcx,8), %rax
+; X64-ALL-NEXT: lfence
+; X64-ALL-NEXT: movl (%rax), %eax
+; X64-ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-ALL-NEXT: jmp .LBB0_4
+; X64-ALL-NEXT: .LBB0_5: # %for.end
+; X64-ALL-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-ALL-NEXT: retq
+;
+; X64-NOOPT-LABEL: test:
+; X64-NOOPT: # %bb.0: # %entry
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; X64-NOOPT-NEXT: .LBB0_1: # %for.cond
+; X64-NOOPT-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: jge .LBB0_6
+; X64-NOOPT-NEXT: # %bb.2: # %for.body
+; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NOOPT-NEXT: movl $2, %ecx
+; X64-NOOPT-NEXT: cltd
+; X64-NOOPT-NEXT: idivl %ecx
+; X64-NOOPT-NEXT: cmpl $0, %edx
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: jne .LBB0_4
+; X64-NOOPT-NEXT: # %bb.3: # %if.then
+; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movq (%rax,%rcx,8), %rax
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl (%rax), %eax
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NOOPT-NEXT: .LBB0_4: # %if.end
+; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: jmp .LBB0_5
+; X64-NOOPT-NEXT: .LBB0_5: # %for.inc
+; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NOOPT-NEXT: addl $1, %eax
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: jmp .LBB0_1
+; X64-NOOPT-NEXT: .LBB0_6: # %for.end
+; X64-NOOPT-NEXT: lfence
+; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NOOPT-NEXT: retq
entry:
%secret.addr = alloca ptr, align 8
%secret_size.addr = alloca i32, align 4
@@ -17,23 +118,7 @@ entry:
store i32 0, ptr %i, align 4
br label %for.cond
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: lfence
-; X64-NEXT: movl $0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: jmp .LBB0_1
-
-; X64-NOOPT: # %bb.0: # %entry
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl $0, -{{[0-9]+}}(%rsp)
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+
for.cond: ; preds = %for.inc, %entry
%0 = load i32, ptr %i, align 4
@@ -41,22 +126,7 @@ for.cond: ; preds = %for.inc, %entry
%cmp = icmp slt i32 %0, %1
br i1 %cmp, label %for.body, label %for.end
-; X64: .LBB0_1: # %for.cond
-; X64-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-ALL-NEXT: lfence
-; X64-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
-; X64-ALL-NEXT: lfence
-; X64-NEXT: jge .LBB0_5
-
-; X64-NOOPT: .LBB0_1: # %for.cond
-; X64-NOOPT-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: jge .LBB0_6
+
for.body: ; preds = %for.cond
%2 = load i32, ptr %i, align 4
@@ -64,27 +134,7 @@ for.body: ; preds = %for.cond
%cmp1 = icmp eq i32 %rem, 0
br i1 %cmp1, label %if.then, label %if.end
-; X64: # %bb.2: # %for.body
-; X64-NEXT: # in Loop: Header=BB0_1 Depth=1
-; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-ALL-NEXT: lfence
-; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: shrl $31, %ecx
-; X64-NEXT: addl %eax, %ecx
-; X64-NEXT: andl $-2, %ecx
-; X64-NEXT: cmpl %ecx, %eax
-; X64-NEXT: jne .LBB0_4
-
-; X64-NOOPT: # %bb.2: # %for.body
-; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NOOPT-NEXT: movl $2, %ecx
-; X64-NOOPT-NEXT: cltd
-; X64-NOOPT-NEXT: idivl %ecx
-; X64-NOOPT-NEXT: cmpl $0, %edx
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: jne .LBB0_4
+
if.then: ; preds = %for.body
%3 = load ptr, ptr %secret.addr, align 8
@@ -96,30 +146,7 @@ if.then: ; preds = %for.body
store i32 %6, ptr %ret_val, align 4
br label %if.end
-; X64: # %bb.3: # %if.then
-; X64-NEXT: # in Loop: Header=BB0_1 Depth=1
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: lfence
-; X64-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
-; X64-NEXT: lfence
-; X64-NEXT: movq (%rax,%rcx,8), %rax
-; X64-NEXT: lfence
-; X64-NEXT: movl (%rax), %eax
-; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT: jmp .LBB0_4
-
-; X64-NOOPT: # %bb.3: # %if.then
-; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movq (%rax,%rcx,8), %rax
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl (%rax), %eax
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+
if.end: ; preds = %if.then, %for.body
br label %for.inc
@@ -130,15 +157,6 @@ for.inc: ; preds = %if.end
store i32 %inc, ptr %i, align 4
br label %for.cond
-; X64-NOOPT: .LBB0_5: # %for.inc
-; X64-NOOPT-NEXT: # in Loop: Header=BB0_1 Depth=1
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; X64-NOOPT-NEXT: addl $1, %eax
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NOOPT-NEXT: lfence
-; X64-NOOPT-NEXT: jmp .LBB0_1
for.end: ; preds = %for.cond
%8 = load i32, ptr %ret_val, align 4
@@ -150,3 +168,5 @@ declare void @llvm.x86.sse2.lfence() #1
attributes #0 = { "target-features"="+lvi-load-hardening" }
attributes #1 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sext-subreg.ll b/llvm/test/CodeGen/X86/sext-subreg.ll
index 3e54f24d13affe..20451ff208cc05 100644
--- a/llvm/test/CodeGen/X86/sext-subreg.ll
+++ b/llvm/test/CodeGen/X86/sext-subreg.ll
@@ -1,16 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; rdar://7529457
define i64 @t(i64 %A, i64 %B, ptr %P, ptr%P2) nounwind {
; CHECK-LABEL: t:
-; CHECK: movslq %e{{.*}}, %rax
-; CHECK: movq %rax
-; CHECK: movl %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: addq %rsi, %rdi
+; CHECK-NEXT: movl %edi, (%rdx)
+; CHECK-NEXT: movslq %edi, %rax
+; CHECK-NEXT: movq %rax, (%rcx)
+; CHECK-NEXT: movl %eax, (%rdx)
+; CHECK-NEXT: retq
%C = add i64 %A, %B
%D = trunc i64 %C to i32
store volatile i32 %D, ptr %P
%E = shl i64 %C, 32
- %F = ashr i64 %E, 32
+ %F = ashr i64 %E, 32
store volatile i64 %F, ptr%P2
store volatile i32 %D, ptr %P
ret i64 undef
diff --git a/llvm/test/CodeGen/X86/x86-64-extend-shift.ll b/llvm/test/CodeGen/X86/x86-64-extend-shift.ll
index 6ebaeee3669713..ae8d450d1345b9 100644
--- a/llvm/test/CodeGen/X86/x86-64-extend-shift.ll
+++ b/llvm/test/CodeGen/X86/x86-64-extend-shift.ll
@@ -1,8 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
; Formerly there were two shifts.
define i64 @baz(i32 %A) nounwind {
-; CHECK: shlq $49, %r
+; CHECK-LABEL: baz:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: shlq $49, %rax
+; CHECK-NEXT: retq
%tmp1 = shl i32 %A, 17
%tmp2 = zext i32 %tmp1 to i64
%tmp3 = shl i64 %tmp2, 32