[llvm] b02b5b7 - [AMDGPU] Simplify use of hasMovrel and hasVGPRIndexMode (#105680)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 23 01:59:25 PDT 2024
Author: Jay Foad
Date: 2024-08-23T09:59:19+01:00
New Revision: b02b5b7b598ff146f8d5ed529412236533429403
URL: https://github.com/llvm/llvm-project/commit/b02b5b7b598ff146f8d5ed529412236533429403
DIFF: https://github.com/llvm/llvm-project/commit/b02b5b7b598ff146f8d5ed529412236533429403.diff
LOG: [AMDGPU] Simplify use of hasMovrel and hasVGPRIndexMode (#105680)
The generic subtarget has neither of these features. Rather than forcing
HasMovrel on, it is simpler to expand dynamic vector indexing to a
sequence of compare/select instructions.
NFC for real subtargets.
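For readers unfamiliar with the expansion: the following is a minimal, illustrative sketch (not part of this patch) of what "expand dynamic vector indexing to a sequence of compare/select instructions" amounts to for a 4-element vector. The real expansion is performed on SelectionDAG nodes guided by shouldExpandVectorDynExt, not as an IR-to-IR rewrite, and the function name below is hypothetical; but the resulting structure is the chain of selects shown here, which maps directly to the v_cmp/v_cndmask sequences in the new GENERIC checks further down.

; Sketch only: a dynamic extractelement rewritten as a compare/select chain.
define float @dyn_extract_sketch(<4 x float> %vec, i32 %idx) {
  ; Pull out all lanes with constant indices.
  %v0 = extractelement <4 x float> %vec, i32 0
  %v1 = extractelement <4 x float> %vec, i32 1
  %v2 = extractelement <4 x float> %vec, i32 2
  %v3 = extractelement <4 x float> %vec, i32 3
  ; Select the requested lane with a chain of compares and selects,
  ; starting from lane 0 as the default.
  %c1 = icmp eq i32 %idx, 1
  %s1 = select i1 %c1, float %v1, float %v0
  %c2 = icmp eq i32 %idx, 2
  %s2 = select i1 %c2, float %v2, float %s1
  %c3 = icmp eq i32 %idx, 3
  %r  = select i1 %c3, float %v3, float %s2
  ret float %r
}

On subtargets with movrel or VGPR index mode, dynamic indexing can instead use the hardware indexing instructions, which is why the cost thresholds in shouldExpandVectorDynExt differ per subtarget.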
Added:
Modified:
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index b3872a6374261b..352994e541fc88 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -143,14 +143,8 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
if (LDSBankCount == 0)
LDSBankCount = 32;
- if (TT.getArch() == Triple::amdgcn) {
- if (LocalMemorySize == 0)
- LocalMemorySize = 32768;
-
- // Do something sensible for unspecified target.
- if (!HasMovrel && !HasVGPRIndexMode)
- HasMovrel = true;
- }
+ if (TT.getArch() == Triple::amdgcn && LocalMemorySize == 0)
+ LocalMemorySize = 32768;
AddressableLocalMemorySize = LocalMemorySize;
@@ -366,7 +360,7 @@ bool GCNSubtarget::hasMadF16() const {
}
bool GCNSubtarget::useVGPRIndexMode() const {
- return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
+ return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
}
bool GCNSubtarget::useAA() const { return UseAA; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c954c0aa71f734..ecd4451c504727 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13350,12 +13350,15 @@ bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
// On some architectures (GFX9) movrel is not available and it's better
// to expand.
- if (!Subtarget->hasMovrel())
+ if (Subtarget->useVGPRIndexMode())
return NumInsts <= 16;
// If movrel is available, use it instead of expanding for vector of 8
// elements.
- return NumInsts <= 15;
+ if (Subtarget->hasMovrel())
+ return NumInsts <= 15;
+
+ return true;
}
bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index c130eb04d02370..a33142fd0ab1f3 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1,4 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GENERIC %s
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 < %s | FileCheck -check-prefix=NOOPT %s
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI-MOVREL %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI,VI-MOVREL %s
@@ -8,6 +9,75 @@
; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) {
+; GENERIC-LABEL: extract_w_offset:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GENERIC-NEXT: v_mov_b32_e32 v1, 0x40a00000
+; GENERIC-NEXT: v_mov_b32_e32 v2, 0x40c00000
+; GENERIC-NEXT: v_mov_b32_e32 v3, 0x40e00000
+; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
+; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41100000
+; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41200000
+; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41300000
+; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41400000
+; GENERIC-NEXT: v_mov_b32_e32 v9, 0x41500000
+; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41600000
+; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41700000
+; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41800000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_add_i32 s6, s4, 1
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 1
+; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v13, 1.0, 2.0, s[4:5]
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 2
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 3
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, 4.0, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 4
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 5
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 6
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 7
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 8
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 9
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 10
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 11
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 12
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 13
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 14
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 15
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
+; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_w_offset:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -201,6 +271,65 @@ entry:
; XXX: Could do v_or_b32 directly
define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %out, i32 %in, <16 x i32> %or.val) {
+; GENERIC-LABEL: extract_w_offset_salu_use_vector:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dword s20, s[2:3], 0xb
+; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_add_i32 s20, s20, 1
+; GENERIC-NEXT: s_or_b32 s2, s19, 16
+; GENERIC-NEXT: s_or_b32 s18, s18, 15
+; GENERIC-NEXT: s_or_b32 s17, s17, 14
+; GENERIC-NEXT: s_or_b32 s16, s16, 13
+; GENERIC-NEXT: s_or_b32 s15, s15, 12
+; GENERIC-NEXT: s_or_b32 s14, s14, 11
+; GENERIC-NEXT: s_or_b32 s13, s13, 10
+; GENERIC-NEXT: s_or_b32 s12, s12, 9
+; GENERIC-NEXT: s_or_b32 s11, s11, 8
+; GENERIC-NEXT: s_or_b32 s10, s10, 7
+; GENERIC-NEXT: s_or_b32 s9, s9, 6
+; GENERIC-NEXT: s_or_b32 s8, s8, 5
+; GENERIC-NEXT: s_or_b32 s7, s7, 4
+; GENERIC-NEXT: s_or_b32 s6, s6, 3
+; GENERIC-NEXT: s_or_b32 s4, s4, 1
+; GENERIC-NEXT: s_or_b32 s5, s5, 2
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 1
+; GENERIC-NEXT: s_cselect_b32 s4, s5, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 2
+; GENERIC-NEXT: s_cselect_b32 s4, s6, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 3
+; GENERIC-NEXT: s_cselect_b32 s4, s7, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 4
+; GENERIC-NEXT: s_cselect_b32 s4, s8, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 5
+; GENERIC-NEXT: s_cselect_b32 s4, s9, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 6
+; GENERIC-NEXT: s_cselect_b32 s4, s10, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 7
+; GENERIC-NEXT: s_cselect_b32 s4, s11, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 8
+; GENERIC-NEXT: s_cselect_b32 s4, s12, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 9
+; GENERIC-NEXT: s_cselect_b32 s4, s13, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 10
+; GENERIC-NEXT: s_cselect_b32 s4, s14, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 11
+; GENERIC-NEXT: s_cselect_b32 s4, s15, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 12
+; GENERIC-NEXT: s_cselect_b32 s4, s16, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 13
+; GENERIC-NEXT: s_cselect_b32 s4, s17, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 14
+; GENERIC-NEXT: s_cselect_b32 s4, s18, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 15
+; GENERIC-NEXT: s_cselect_b32 s4, s2, s4
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v0, s4
+; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_w_offset_salu_use_vector:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -494,6 +623,74 @@ entry:
}
define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) {
+; GENERIC-LABEL: extract_wo_offset:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_load_dword s6, s[2:3], 0xb
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GENERIC-NEXT: v_mov_b32_e32 v1, 0x40a00000
+; GENERIC-NEXT: v_mov_b32_e32 v2, 0x40c00000
+; GENERIC-NEXT: v_mov_b32_e32 v3, 0x40e00000
+; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
+; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41100000
+; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41200000
+; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41300000
+; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41400000
+; GENERIC-NEXT: v_mov_b32_e32 v9, 0x41500000
+; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41600000
+; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41700000
+; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41800000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 1
+; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v13, 1.0, 2.0, s[4:5]
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 2
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 3
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, 4.0, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 4
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 5
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 6
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 7
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 8
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 9
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 10
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 11
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 12
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 13
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 14
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 15
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
+; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_wo_offset:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -679,6 +876,50 @@ entry:
}
define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %offset) {
+; GENERIC-LABEL: extract_neg_offset_sgpr:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_add_i32 s2, s4, 0xfffffe00
+; GENERIC-NEXT: s_cmp_eq_u32 s2, 1
+; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 2
+; GENERIC-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GENERIC-NEXT: v_readfirstlane_b32 s4, v0
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 2
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 3
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 3
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 4
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 5
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 6
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 6
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 7
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 7
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 8
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 8
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 9
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 9
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 10
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 10
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 11
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 11
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 12
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 12
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 13
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 13
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 14
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 14
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 15
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 15
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 16
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v0, s4
+; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_neg_offset_sgpr:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -866,6 +1107,66 @@ entry:
}
define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) {
+; GENERIC-LABEL: extract_neg_offset_sgpr_loaded:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19
+; GENERIC-NEXT: s_load_dword s20, s[2:3], 0x39
+; GENERIC-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x29
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_addk_i32 s20, 0xfe00
+; GENERIC-NEXT: s_or_b32 s2, s19, s51
+; GENERIC-NEXT: s_or_b32 s18, s18, s50
+; GENERIC-NEXT: s_or_b32 s17, s17, s49
+; GENERIC-NEXT: s_or_b32 s16, s16, s48
+; GENERIC-NEXT: s_or_b32 s15, s15, s47
+; GENERIC-NEXT: s_or_b32 s14, s14, s46
+; GENERIC-NEXT: s_or_b32 s13, s13, s45
+; GENERIC-NEXT: s_or_b32 s12, s12, s44
+; GENERIC-NEXT: s_or_b32 s11, s11, s43
+; GENERIC-NEXT: s_or_b32 s10, s10, s42
+; GENERIC-NEXT: s_or_b32 s9, s9, s41
+; GENERIC-NEXT: s_or_b32 s8, s8, s40
+; GENERIC-NEXT: s_or_b32 s7, s7, s39
+; GENERIC-NEXT: s_or_b32 s6, s6, s38
+; GENERIC-NEXT: s_or_b32 s4, s4, s36
+; GENERIC-NEXT: s_or_b32 s5, s5, s37
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 1
+; GENERIC-NEXT: s_cselect_b32 s4, s5, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 2
+; GENERIC-NEXT: s_cselect_b32 s4, s6, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 3
+; GENERIC-NEXT: s_cselect_b32 s4, s7, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 4
+; GENERIC-NEXT: s_cselect_b32 s4, s8, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 5
+; GENERIC-NEXT: s_cselect_b32 s4, s9, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 6
+; GENERIC-NEXT: s_cselect_b32 s4, s10, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 7
+; GENERIC-NEXT: s_cselect_b32 s4, s11, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 8
+; GENERIC-NEXT: s_cselect_b32 s4, s12, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 9
+; GENERIC-NEXT: s_cselect_b32 s4, s13, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 10
+; GENERIC-NEXT: s_cselect_b32 s4, s14, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 11
+; GENERIC-NEXT: s_cselect_b32 s4, s15, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 12
+; GENERIC-NEXT: s_cselect_b32 s4, s16, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 13
+; GENERIC-NEXT: s_cselect_b32 s4, s17, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 14
+; GENERIC-NEXT: s_cselect_b32 s4, s18, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 15
+; GENERIC-NEXT: s_cselect_b32 s4, s2, s4
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v0, s4
+; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_neg_offset_sgpr_loaded:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -1161,6 +1462,46 @@ entry:
}
define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
+; GENERIC-LABEL: extract_neg_offset_vgpr:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_add_i32_e32 v0, vcc, 0xfffffe00, v0
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GENERIC-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 6, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 7, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 8, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 9, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 10, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 11, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 12, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 13, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 14, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 15, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, 16, v1, vcc
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_neg_offset_vgpr:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
@@ -1458,6 +1799,18 @@ entry:
; undefined behavior, but shouldn't crash compiler
define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GENERIC-LABEL: extract_undef_offset_sgpr:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s7, 0xf000
+; GENERIC-NEXT: s_mov_b32 s6, -1
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_mov_b32 s4, s2
+; GENERIC-NEXT: s_mov_b32 s5, s3
+; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_undef_offset_sgpr:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
@@ -1513,6 +1866,10 @@ entry:
; undefined behavior, but shouldn't crash compiler
define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GENERIC-LABEL: insert_undef_offset_sgpr_vector_src:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insert_undef_offset_sgpr_vector_src:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_endpgm
@@ -1536,6 +1893,84 @@ entry:
}
define amdgpu_kernel void @insert_w_offset(ptr addrspace(1) %out, i32 %in) {
+; GENERIC-LABEL: insert_w_offset:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000
+; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
+; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000
+; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000
+; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000
+; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000
+; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000
+; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000
+; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000
+; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000
+; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000
+; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_add_i32 s4, s4, 1
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 3
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 2
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 1
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 0
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 7
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 6
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 5
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 4
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 11
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 9
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 8
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 15
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 14
+; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 13
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 12
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc
+; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insert_w_offset:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -1813,6 +2248,85 @@ entry:
}
define amdgpu_kernel void @insert_unsigned_base_plus_offset(ptr addrspace(1) %out, i16 %in) {
+; GENERIC-LABEL: insert_unsigned_base_plus_offset:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000
+; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
+; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000
+; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000
+; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000
+; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000
+; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000
+; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000
+; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000
+; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000
+; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000
+; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_and_b32 s4, s4, 0xffff
+; GENERIC-NEXT: s_add_i32 s4, s4, 1
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 3
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 2
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 1
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 0
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 7
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 6
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 5
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 4
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 11
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 9
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 8
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 15
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 14
+; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 13
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 12
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc
+; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insert_unsigned_base_plus_offset:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -2091,6 +2605,85 @@ entry:
}
define amdgpu_kernel void @insert_signed_base_plus_offset(ptr addrspace(1) %out, i16 %in) {
+; GENERIC-LABEL: insert_signed_base_plus_offset:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000
+; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
+; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000
+; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000
+; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000
+; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000
+; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000
+; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000
+; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000
+; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000
+; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000
+; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_sext_i32_i16 s4, s4
+; GENERIC-NEXT: s_add_i32 s4, s4, 1
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 3
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 2
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 1
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 0
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 7
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 6
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 5
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 4
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 11
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 9
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 8
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 15
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 14
+; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 13
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 12
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc
+; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insert_signed_base_plus_offset:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -2376,6 +2969,83 @@ entry:
; Make sure that TwoAddressInstructions keeps src0 as subregister sub0
; of the tied implicit use and def of the super register.
define amdgpu_kernel void @insert_wo_offset(ptr addrspace(1) %out, i32 %in) {
+; GENERIC-LABEL: insert_wo_offset:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41880000
+; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
+; GENERIC-NEXT: v_mov_b32_e32 v5, 0x40e00000
+; GENERIC-NEXT: v_mov_b32_e32 v8, 0x40c00000
+; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000
+; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41400000
+; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41300000
+; GENERIC-NEXT: v_mov_b32_e32 v13, 0x41200000
+; GENERIC-NEXT: v_mov_b32_e32 v14, 0x41100000
+; GENERIC-NEXT: v_mov_b32_e32 v15, 0x41800000
+; GENERIC-NEXT: v_mov_b32_e32 v16, 0x41700000
+; GENERIC-NEXT: v_mov_b32_e32 v17, 0x41600000
+; GENERIC-NEXT: v_mov_b32_e32 v18, 0x41500000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 3
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4.0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 2
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, v0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 1
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2.0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 0
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1.0, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 7
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v4, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 6
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v5, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 5
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 4
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 11
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 9
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 8
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 15
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v9, v15, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 14
+; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, v16, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 13
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v17, v10, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 12
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v18, v10, vcc
+; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insert_wo_offset:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -2646,6 +3316,71 @@ entry:
}
define amdgpu_kernel void @insert_neg_offset_sgpr(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %offset) {
+; GENERIC-LABEL: insert_neg_offset_sgpr:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xd
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_add_i32 s6, s4, 0xfffffe00
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 0
+; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 3
+; GENERIC-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GENERIC-NEXT: s_cselect_b32 s4, 16, 3
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 2
+; GENERIC-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GENERIC-NEXT: s_cselect_b32 s5, 16, 2
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 1
+; GENERIC-NEXT: v_mov_b32_e32 v3, s4
+; GENERIC-NEXT: s_cselect_b32 s4, 16, 1
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 7
+; GENERIC-NEXT: v_mov_b32_e32 v2, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 16, 7
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 6
+; GENERIC-NEXT: v_mov_b32_e32 v1, s4
+; GENERIC-NEXT: s_cselect_b32 s4, 16, 6
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 5
+; GENERIC-NEXT: v_mov_b32_e32 v7, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 16, 5
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 4
+; GENERIC-NEXT: v_mov_b32_e32 v6, s4
+; GENERIC-NEXT: s_cselect_b32 s4, 16, 4
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 11
+; GENERIC-NEXT: v_mov_b32_e32 v5, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 16, 11
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 10
+; GENERIC-NEXT: v_mov_b32_e32 v4, s4
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GENERIC-NEXT: s_cselect_b32 s4, 16, 10
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 9
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 16, 9
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 8
+; GENERIC-NEXT: v_mov_b32_e32 v6, s4
+; GENERIC-NEXT: s_cselect_b32 s4, 16, 8
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 15
+; GENERIC-NEXT: v_mov_b32_e32 v5, s5
+; GENERIC-NEXT: s_cselect_b32 s5, 16, 15
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 14
+; GENERIC-NEXT: v_mov_b32_e32 v4, s4
+; GENERIC-NEXT: s_cselect_b32 s4, 16, 14
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 13
+; GENERIC-NEXT: s_cselect_b32 s7, 16, 13
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 12
+; GENERIC-NEXT: s_cselect_b32 s6, 16, 12
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s5
+; GENERIC-NEXT: v_mov_b32_e32 v6, s4
+; GENERIC-NEXT: v_mov_b32_e32 v5, s7
+; GENERIC-NEXT: v_mov_b32_e32 v4, s6
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insert_neg_offset_sgpr:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
@@ -2930,6 +3665,71 @@ entry:
; The vector indexed into is originally loaded into an SGPR rather
; than built with a reg_sequence
define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(ptr addrspace(1) %in, ptr addrspace(1) %out, <16 x i32> %vec, i32 %offset) {
+; GENERIC-LABEL: insert_neg_offset_sgpr_loadreg:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0xb
+; GENERIC-NEXT: s_load_dword s20, s[2:3], 0x29
+; GENERIC-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x19
+; GENERIC-NEXT: s_mov_b32 s19, 0xf000
+; GENERIC-NEXT: s_mov_b32 s18, -1
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_addk_i32 s20, 0xfe00
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 3
+; GENERIC-NEXT: s_cselect_b32 s3, s3, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 2
+; GENERIC-NEXT: s_cselect_b32 s2, s2, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 1
+; GENERIC-NEXT: v_mov_b32_e32 v3, s3
+; GENERIC-NEXT: s_cselect_b32 s1, s1, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 0
+; GENERIC-NEXT: v_mov_b32_e32 v2, s2
+; GENERIC-NEXT: s_cselect_b32 s0, s0, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 7
+; GENERIC-NEXT: v_mov_b32_e32 v1, s1
+; GENERIC-NEXT: s_cselect_b32 s1, s7, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 6
+; GENERIC-NEXT: v_mov_b32_e32 v0, s0
+; GENERIC-NEXT: s_cselect_b32 s0, s6, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 5
+; GENERIC-NEXT: v_mov_b32_e32 v7, s1
+; GENERIC-NEXT: s_cselect_b32 s1, s5, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 4
+; GENERIC-NEXT: v_mov_b32_e32 v6, s0
+; GENERIC-NEXT: s_cselect_b32 s0, s4, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 11
+; GENERIC-NEXT: v_mov_b32_e32 v5, s1
+; GENERIC-NEXT: s_cselect_b32 s1, s11, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 10
+; GENERIC-NEXT: v_mov_b32_e32 v4, s0
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16
+; GENERIC-NEXT: s_cselect_b32 s0, s10, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 9
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s1
+; GENERIC-NEXT: s_cselect_b32 s1, s9, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 8
+; GENERIC-NEXT: v_mov_b32_e32 v6, s0
+; GENERIC-NEXT: s_cselect_b32 s0, s8, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 15
+; GENERIC-NEXT: v_mov_b32_e32 v5, s1
+; GENERIC-NEXT: s_cselect_b32 s1, s15, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 14
+; GENERIC-NEXT: v_mov_b32_e32 v4, s0
+; GENERIC-NEXT: s_cselect_b32 s0, s14, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 13
+; GENERIC-NEXT: s_cselect_b32 s2, s13, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s20, 12
+; GENERIC-NEXT: s_cselect_b32 s3, s12, 5
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:32
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_mov_b32_e32 v7, s1
+; GENERIC-NEXT: v_mov_b32_e32 v6, s0
+; GENERIC-NEXT: v_mov_b32_e32 v5, s2
+; GENERIC-NEXT: v_mov_b32_e32 v4, s3
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:48
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insert_neg_offset_sgpr_loadreg:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
@@ -3170,6 +3970,51 @@ entry:
}
define amdgpu_kernel void @insert_neg_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GENERIC-LABEL: insert_neg_offset_vgpr:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_add_i32_e32 v12, vcc, 0xfffffe00, v0
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v3, 4, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v2, 3, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v1, 2, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v0, 1, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v7, 8, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v6, 7, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v5, 6, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v4, 5, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v11, 12, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v10, 11, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v9, 10, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v8, 9, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v15, 16, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v14, 15, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v13, 14, 33, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12
+; GENERIC-NEXT: v_cndmask_b32_e64 v12, 13, 33, vcc
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GENERIC-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insert_neg_offset_vgpr:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
@@ -3611,6 +4456,52 @@ entry:
}
define amdgpu_kernel void @insert_neg_inline_offset_vgpr(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GENERIC-LABEL: insert_neg_inline_offset_vgpr:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_add_i32_e32 v12, vcc, -16, v0
+; GENERIC-NEXT: v_mov_b32_e32 v16, 0x1f4
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 4, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, 3, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, 1, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, 8, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, 7, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, 6, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 5, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v11, 12, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v10, 11, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v9, 10, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, 9, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v15, 16, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v14, 15, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v13, 14, v16, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v12
+; GENERIC-NEXT: v_cndmask_b32_e32 v12, 13, v16, vcc
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GENERIC-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insert_neg_inline_offset_vgpr:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
@@ -4057,6 +4948,98 @@ entry:
; When the block is split to insert the loop, make sure any other
; places that need to be expanded in the same block are also handled.
define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) {
+; GENERIC-LABEL: extract_vgpr_offset_multiple_in_block:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
+; GENERIC-NEXT: s_mov_b32 s11, 0xf000
+; GENERIC-NEXT: s_mov_b32 s6, 0
+; GENERIC-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GENERIC-NEXT: v_mov_b32_e32 v2, 0
+; GENERIC-NEXT: s_mov_b32 s7, s11
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s10, -1
+; GENERIC-NEXT: ;;#ASMSTART
+; GENERIC-NEXT: s_mov_b32 s4, 17
+; GENERIC-NEXT: ;;#ASMEND
+; GENERIC-NEXT: v_add_i32_e32 v2, vcc, 1, v1
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GENERIC-NEXT: v_cndmask_b32_e64 v3, 7, 9, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 11, v3, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GENERIC-NEXT: v_cndmask_b32_e64 v4, 7, 9, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 13, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 11, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 5, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 13, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 6, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 7, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 6, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 8, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 7, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 9, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 8, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 10, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 9, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 11, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 10, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 12, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 11, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 13, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 12, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 14, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 13, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 15, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 14, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v1
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 16, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 15, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, 16, v3, vcc
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: buffer_store_dword v1, off, s[8:11], 0
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: buffer_store_dword v2, off, s[8:11], 0
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GENERIC-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GENERIC-NEXT: s_cbranch_execz .LBB16_2
+; GENERIC-NEXT: ; %bb.1: ; %bb1
+; GENERIC-NEXT: v_mov_b32_e32 v0, s4
+; GENERIC-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: .LBB16_2: ; %bb2
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_vgpr_offset_multiple_in_block:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4717,6 +5700,122 @@ bb2:
}
define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in, <16 x i32> %vec0) {
+; GENERIC-LABEL: insert_vgpr_offset_multiple_in_block:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[24:25], s[2:3], 0xd
+; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19
+; GENERIC-NEXT: s_mov_b32 s23, 0xf000
+; GENERIC-NEXT: s_mov_b32 s26, 0
+; GENERIC-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GENERIC-NEXT: v_mov_b32_e32 v2, 0
+; GENERIC-NEXT: s_mov_b32 s27, s23
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: buffer_load_dword v2, v[1:2], s[24:27], 0 addr64 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s22, -1
+; GENERIC-NEXT: ;;#ASMSTART
+; GENERIC-NEXT: v_mov_b32 v1, 62
+; GENERIC-NEXT: ;;#ASMEND
+; GENERIC-NEXT: v_mov_b32_e32 v3, s16
+; GENERIC-NEXT: v_mov_b32_e32 v4, s17
+; GENERIC-NEXT: v_mov_b32_e32 v5, s18
+; GENERIC-NEXT: v_mov_b32_e32 v6, s19
+; GENERIC-NEXT: v_mov_b32_e32 v7, s12
+; GENERIC-NEXT: v_mov_b32_e32 v8, s13
+; GENERIC-NEXT: v_mov_b32_e32 v9, s14
+; GENERIC-NEXT: v_mov_b32_e32 v10, s15
+; GENERIC-NEXT: v_mov_b32_e32 v11, s8
+; GENERIC-NEXT: v_mov_b32_e32 v12, s9
+; GENERIC-NEXT: v_mov_b32_e32 v13, s10
+; GENERIC-NEXT: v_mov_b32_e32 v14, s11
+; GENERIC-NEXT: v_mov_b32_e32 v15, s4
+; GENERIC-NEXT: v_mov_b32_e32 v16, s5
+; GENERIC-NEXT: v_mov_b32_e32 v17, s6
+; GENERIC-NEXT: v_mov_b32_e32 v18, s7
+; GENERIC-NEXT: v_add_i32_e32 v19, vcc, 1, v2
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 12, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v20, v3, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 13, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v21, v4, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 14, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v22, v5, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 15, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v23, v6, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 8, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v24, v7, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 9, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v25, v8, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 10, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v26, v9, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 11, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v11, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v12, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, v13, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v9, v14, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v11, v15, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, v16, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, v17, v1, vcc
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, v18, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, 63, v2, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, 63, v4, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 1, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, 63, v3, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, 63, v11, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v9, 63, v9, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, 63, v8, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, 63, v7, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, 63, v6, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v13, 63, v10, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v12, 63, v26, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v11, 63, v25, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v10, 63, v24, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v17, 63, v23, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v16, 63, v22, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v15, 63, v21, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v19
+; GENERIC-NEXT: v_cndmask_b32_e32 v14, 63, v20, vcc
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: buffer_store_dwordx4 v[14:17], off, s[20:23], 0 offset:48
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: buffer_store_dwordx4 v[10:13], off, s[20:23], 0 offset:32
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[20:23], 0 offset:16
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: buffer_store_dwordx4 v[2:5], off, s[20:23], 0
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GENERIC-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GENERIC-NEXT: s_cbranch_execz .LBB17_2
+; GENERIC-NEXT: ; %bb.1: ; %bb1
+; GENERIC-NEXT: buffer_store_dword v1, off, s[20:23], 0
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: .LBB17_2: ; %bb2
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insert_vgpr_offset_multiple_in_block:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_mov_b32 s28, SCRATCH_RSRC_DWORD0
@@ -5530,6 +6629,136 @@ bb2:
; Avoid inserting extra v_mov from copies within the vgpr indexing sequence. The
; gpr_idx mode switching sequence is expanded late for this reason.
define amdgpu_kernel void @insert_w_offset_multiple_in_block(ptr addrspace(1) %out1, i32 %in) {
+; GENERIC-LABEL: insert_w_offset_multiple_in_block:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41500000
+; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41880000
+; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41600000
+; GENERIC-NEXT: v_mov_b32_e32 v2, 0x41700000
+; GENERIC-NEXT: v_mov_b32_e32 v3, 0x41800000
+; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41100000
+; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41200000
+; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41300000
+; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41400000
+; GENERIC-NEXT: v_mov_b32_e32 v9, 0x40a00000
+; GENERIC-NEXT: v_mov_b32_e32 v10, 0x40c00000
+; GENERIC-NEXT: v_mov_b32_e32 v11, 0x40e00000
+; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41000000
+; GENERIC-NEXT: v_mov_b32_e32 v15, 0x40400000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_add_i32 s5, s4, 1
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 12
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 13
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 14
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 15
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 8
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 9
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 10
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 11
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 4
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v9, v9, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 5
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v10, v10, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 6
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 7
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v12, v12, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 0
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v13, 1.0, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 1
+; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v14, 2.0, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 2
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v15, v15, v8, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s5, 3
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v16, 4.0, v8, vcc
+; GENERIC-NEXT: s_add_i32 s4, s4, 2
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 3
+; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e32 v16, v8, v16, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 2
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v15, v8, v15, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 1
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v14, v8, v14, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 0
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v13, v8, v13, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 7
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 6
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v11, v8, v11, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 5
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 4
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 11
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 10
+; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 9
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 8
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 15
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 14
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 13
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 12
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GENERIC-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insert_w_offset_multiple_in_block:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -5998,6 +7227,40 @@ entry:
; Make sure we don't hit use of undefined register errors when expanding an
; extract with undef index.
define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) {
+; GENERIC-LABEL: extract_adjacent_blocks:
+; GENERIC: ; %bb.0: ; %bb
+; GENERIC-NEXT: s_load_dword s0, s[2:3], 0x9
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_cmp_lg_u32 s0, 0
+; GENERIC-NEXT: s_cbranch_scc0 .LBB19_4
+; GENERIC-NEXT: ; %bb.1: ; %bb4
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: ;;#ASMSTART
+; GENERIC-NEXT: ; reg use v[0:3]
+; GENERIC-NEXT: ;;#ASMEND
+; GENERIC-NEXT: s_mov_b64 vcc, exec
+; GENERIC-NEXT: s_cbranch_execnz .LBB19_3
+; GENERIC-NEXT: .LBB19_2: ; %bb1
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: ;;#ASMSTART
+; GENERIC-NEXT: ; reg use v[0:3]
+; GENERIC-NEXT: ;;#ASMEND
+; GENERIC-NEXT: .LBB19_3: ; %bb7
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: s_endpgm
+; GENERIC-NEXT: .LBB19_4:
+; GENERIC-NEXT: s_mov_b64 vcc, 0
+; GENERIC-NEXT: s_branch .LBB19_2
+;
; NOOPT-LABEL: extract_adjacent_blocks:
; NOOPT: ; %bb.0: ; %bb
; NOOPT-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
@@ -6210,6 +7473,40 @@ bb7:
}
define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) {
+; GENERIC-LABEL: insert_adjacent_blocks:
+; GENERIC: ; %bb.0: ; %bb
+; GENERIC-NEXT: s_load_dword s0, s[2:3], 0x9
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_cmp_lg_u32 s0, 0
+; GENERIC-NEXT: s_cbranch_scc0 .LBB20_4
+; GENERIC-NEXT: ; %bb.1: ; %bb4
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: ;;#ASMSTART
+; GENERIC-NEXT: ; reg use v[0:3]
+; GENERIC-NEXT: ;;#ASMEND
+; GENERIC-NEXT: s_mov_b64 vcc, exec
+; GENERIC-NEXT: s_cbranch_execnz .LBB20_3
+; GENERIC-NEXT: .LBB20_2: ; %bb1
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: ;;#ASMSTART
+; GENERIC-NEXT: ; reg use v[0:3]
+; GENERIC-NEXT: ;;#ASMEND
+; GENERIC-NEXT: .LBB20_3: ; %bb7
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: s_endpgm
+; GENERIC-NEXT: .LBB20_4:
+; GENERIC-NEXT: s_mov_b64 vcc, 0
+; GENERIC-NEXT: s_branch .LBB20_2
+;
; NOOPT-LABEL: insert_adjacent_blocks:
; NOOPT: ; %bb.0: ; %bb
; NOOPT-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
@@ -6430,6 +7727,24 @@ bb7:
; FIXME: Should be able to fold zero input to movreld to inline imm?
define amdgpu_kernel void @multi_same_block(i32 %arg) {
+; GENERIC-LABEL: multi_same_block:
+; GENERIC: ; %bb.0: ; %bb
+; GENERIC-NEXT: s_load_dword s0, s[2:3], 0x9
+; GENERIC-NEXT: v_mov_b32_e32 v0, 0x41900000
+; GENERIC-NEXT: v_mov_b32_e32 v1, 0x41b0cccd
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_add_i32 s2, s0, -16
+; GENERIC-NEXT: s_cmp_eq_u32 s2, 1
+; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v0, v0, 4.0, s[0:1]
+; GENERIC-NEXT: s_cmp_eq_u32 s2, 5
+; GENERIC-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v1, v1, -4.0, s[0:1]
+; GENERIC-NEXT: s_mov_b32 m0, -1
+; GENERIC-NEXT: ds_write_b32 v0, v0
+; GENERIC-NEXT: ds_write_b32 v0, v1
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: multi_same_block:
; NOOPT: ; %bb.0: ; %bb
; NOOPT-NEXT: s_load_dword s0, s[2:3], 0x9
@@ -6596,6 +7911,76 @@ bb:
; offset puts the index outside of the superregister boundaries, so clamp to the 1st element.
define amdgpu_kernel void @extract_largest_inbounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
+; GENERIC-LABEL: extract_largest_inbounds_offset:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s7, 0xf000
+; GENERIC-NEXT: s_mov_b32 s6, -1
+; GENERIC-NEXT: s_load_dword s12, s[2:3], 0xd
+; GENERIC-NEXT: s_mov_b32 s2, s6
+; GENERIC-NEXT: s_mov_b32 s3, s7
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_mov_b32 s4, s8
+; GENERIC-NEXT: s_mov_b32 s5, s9
+; GENERIC-NEXT: s_mov_b32 s0, s10
+; GENERIC-NEXT: s_mov_b32 s1, s11
+; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: s_add_i32 s12, s12, 15
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 1
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 2
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 3
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 4
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 5
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 6
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 7
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 8
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 9
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 10
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 11
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 12
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 13
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 14
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 15
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc
+; GENERIC-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_largest_inbounds_offset:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -6790,6 +8175,76 @@ entry:
}
define amdgpu_kernel void @extract_out_of_bounds_offset(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) {
+; GENERIC-LABEL: extract_out_of_bounds_offset:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s7, 0xf000
+; GENERIC-NEXT: s_mov_b32 s6, -1
+; GENERIC-NEXT: s_load_dword s12, s[2:3], 0xd
+; GENERIC-NEXT: s_mov_b32 s2, s6
+; GENERIC-NEXT: s_mov_b32 s3, s7
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_mov_b32 s4, s8
+; GENERIC-NEXT: s_mov_b32 s5, s9
+; GENERIC-NEXT: s_mov_b32 s0, s10
+; GENERIC-NEXT: s_mov_b32 s1, s11
+; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: s_add_i32 s12, s12, 16
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 1
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 2
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 3
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 4
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 5
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 6
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 7
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 8
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 9
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 10
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 11
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 12
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 13
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 14
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s12, 15
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc
+; GENERIC-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_out_of_bounds_offset:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -6984,6 +8439,77 @@ entry:
}
define amdgpu_kernel void @extractelement_v16i32_or_index(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx.in) {
+; GENERIC-LABEL: extractelement_v16i32_or_index:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s7, 0xf000
+; GENERIC-NEXT: s_mov_b32 s6, -1
+; GENERIC-NEXT: s_load_dword s12, s[2:3], 0xd
+; GENERIC-NEXT: s_mov_b32 s2, s6
+; GENERIC-NEXT: s_mov_b32 s3, s7
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_mov_b32 s4, s8
+; GENERIC-NEXT: s_mov_b32 s5, s9
+; GENERIC-NEXT: s_mov_b32 s0, s10
+; GENERIC-NEXT: s_mov_b32 s1, s11
+; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: s_lshl_b32 s0, s12, 2
+; GENERIC-NEXT: s_or_b32 s0, s0, 1
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 1
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 2
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 3
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 4
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 5
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 6
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 7
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 8
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 9
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 10
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 11
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 12
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 13
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 14
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_cmp_eq_u32 s0, 15
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc
+; GENERIC-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extractelement_v16i32_or_index:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -7179,6 +8705,89 @@ entry:
}
define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out, <16 x float> %a, i32 %idx.in) nounwind {
+; GENERIC-LABEL: insertelement_v16f32_or_index:
+; GENERIC: ; %bb.0:
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19
+; GENERIC-NEXT: s_load_dword s20, s[2:3], 0x29
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v10, 0x40a00000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_lshl_b32 s20, s20, 2
+; GENERIC-NEXT: v_mov_b32_e32 v0, s7
+; GENERIC-NEXT: v_mov_b32_e32 v1, s6
+; GENERIC-NEXT: v_mov_b32_e32 v4, s5
+; GENERIC-NEXT: v_mov_b32_e32 v5, s4
+; GENERIC-NEXT: v_mov_b32_e32 v6, s11
+; GENERIC-NEXT: v_mov_b32_e32 v8, s10
+; GENERIC-NEXT: v_mov_b32_e32 v9, s9
+; GENERIC-NEXT: v_mov_b32_e32 v11, s8
+; GENERIC-NEXT: v_mov_b32_e32 v12, s15
+; GENERIC-NEXT: v_mov_b32_e32 v13, s14
+; GENERIC-NEXT: v_mov_b32_e32 v14, s13
+; GENERIC-NEXT: v_mov_b32_e32 v15, s12
+; GENERIC-NEXT: v_mov_b32_e32 v16, s19
+; GENERIC-NEXT: v_mov_b32_e32 v17, s18
+; GENERIC-NEXT: v_mov_b32_e32 v18, s17
+; GENERIC-NEXT: v_mov_b32_e32 v19, s16
+; GENERIC-NEXT: s_or_b32 s4, s20, 1
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 3
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 2
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v2, v10, v1, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 1
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, v10, v4, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 0
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v10, v5, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 7
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v10, v6, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 6
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v10, v8, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 5
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v10, v9, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 4
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 11
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, v10, v12, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 10
+; GENERIC-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v10, v13, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 9
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 8
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, v10, v15, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 15
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v9, v10, v16, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 14
+; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, v10, v17, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 13
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, v10, v18, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s4, 12
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc
+; GENERIC-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
+; GENERIC-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insertelement_v16f32_or_index:
; NOOPT: ; %bb.0:
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -7429,6 +9038,34 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(ptr addrspace(1) %out,
}
define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) {
+; GENERIC-LABEL: broken_phi_bb:
+; GENERIC: ; %bb.0: ; %bb
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s6, 8
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: s_branch .LBB26_2
+; GENERIC-NEXT: .LBB26_1: ; %Flow
+; GENERIC-NEXT: ; in Loop: Header=BB26_2 Depth=1
+; GENERIC-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GENERIC-NEXT: s_cbranch_vccz .LBB26_4
+; GENERIC-NEXT: .LBB26_2: ; %bb2
+; GENERIC-NEXT: ; =>This Inner Loop Header: Depth=1
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_cmp_ge_i32 s6, s0
+; GENERIC-NEXT: s_mov_b64 s[4:5], -1
+; GENERIC-NEXT: ; implicit-def: $sgpr6
+; GENERIC-NEXT: s_cbranch_scc1 .LBB26_1
+; GENERIC-NEXT: ; %bb.3: ; %bb4
+; GENERIC-NEXT: ; in Loop: Header=BB26_2 Depth=1
+; GENERIC-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: s_mov_b64 s[4:5], 0
+; GENERIC-NEXT: s_mov_b32 s6, s1
+; GENERIC-NEXT: s_branch .LBB26_1
+; GENERIC-NEXT: .LBB26_4: ; %bb8
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: broken_phi_bb:
; NOOPT: ; %bb.0: ; %bb
; NOOPT-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0
@@ -7846,6 +9483,73 @@ bb8:
}
define amdgpu_cs void @insert_or_disj_index(ptr addrspace(1) %out, ptr addrspace(4) %in, i32 %val, <4 x i32> inreg %desc, i32 inreg %A) {
+; GENERIC-LABEL: insert_or_disj_index:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: v_mov_b32_e32 v2, s4
+; GENERIC-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen
+; GENERIC-NEXT: s_mov_b32 s2, 0
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s0, s2
+; GENERIC-NEXT: s_mov_b32 s1, s2
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: v_readfirstlane_b32 s4, v2
+; GENERIC-NEXT: s_or_b32 s4, s4, 1
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 3
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 2
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v7, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 1
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v6, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 0
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v5, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 7
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v12, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 6
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 5
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 4
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v9, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 11
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v13, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 10
+; GENERIC-NEXT: buffer_store_dwordx4 v[9:12], v[0:1], s[0:3], 0 addr64 offset:16
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e32 v12, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 9
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 8
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 15
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v14, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 14
+; GENERIC-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[0:3], 0 addr64 offset:32
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: s_waitcnt expcnt(0)
+; GENERIC-NEXT: v_cndmask_b32_e32 v13, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 13
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v12, 0, v4, vcc
+; GENERIC-NEXT: s_cmp_eq_u32 s4, 12
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc
+; GENERIC-NEXT: buffer_store_dwordx4 v[11:14], v[0:1], s[0:3], 0 addr64 offset:48
+; GENERIC-NEXT: buffer_store_dwordx4 v[5:8], v[0:1], s[0:3], 0 addr64
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insert_or_disj_index:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0