[llvm] 41507fe - [GISel] Combine (Scalarize) vector load followed by an element extract.
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 5 21:53:28 PST 2023
Author: Pranav Taneja
Date: 2023-12-06T11:23:23+05:30
New Revision: 41507fe595d0fa3d81e151d70431d51897f8d14d
URL: https://github.com/llvm/llvm-project/commit/41507fe595d0fa3d81e151d70431d51897f8d14d
DIFF: https://github.com/llvm/llvm-project/commit/41507fe595d0fa3d81e151d70431d51897f8d14d.diff
LOG: [GISel] Combine (Scalarize) vector load followed by an element extract.
Added:
Modified:
llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
llvm/include/llvm/Target/GlobalISel/Combine.td
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index ba72a3b71ffd7..a4e9c92b48976 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -196,6 +196,10 @@ class CombinerHelper {
/// Match (and (load x), mask) -> zextload x
bool matchCombineLoadWithAndMask(MachineInstr &MI, BuildFnTy &MatchInfo);
+ /// Match a G_EXTRACT_VECTOR_ELT whose vector operand is defined by a G_LOAD;
+ /// on success MatchInfo builds a scalar load of just the extracted element.
+ bool matchCombineExtractedVectorLoad(MachineInstr &MI, BuildFnTy &MatchInfo);
+
bool matchCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo);
void applyCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 9a84ab80157f3..77db371adaf77 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -259,6 +259,12 @@ def sext_inreg_to_zext_inreg : GICombineRule<
}])
>;
+def combine_extracted_vector_load : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root,
+ [{ return Helper.matchCombineExtractedVectorLoad(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
+
def combine_indexed_load_store : GICombineRule<
(defs root:$root, indexed_load_store_matchdata:$matchinfo),
(match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD, G_STORE):$root,
@@ -1291,8 +1297,8 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop,
constant_fold_fp_binop]>;
def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
- extract_vec_elt_combines, combines_for_extload,
- undef_combines, identity_combines, phi_combines,
+ extract_vec_elt_combines, combines_for_extload, combine_extracted_vector_load,
+ undef_combines, identity_combines, phi_combines,
simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, shifts_too_big,
reassocs, ptr_add_immed_chain,
shl_ashr_to_sext_inreg, sext_inreg_of_load,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index c2a7c2d011881..491eabc06f387 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1165,6 +1165,101 @@ bool CombinerHelper::findPreIndexCandidate(GLoadStore &LdSt, Register &Addr,
return RealUse;
}
+/// Match a G_EXTRACT_VECTOR_ELT whose vector operand is produced by a G_LOAD
+/// and, if profitable, describe how to replace the pair with a scalar load of
+/// only the extracted element.
+///
+/// The match is rejected when the loaded vector has more than one non-debug
+/// use, the load is not simple, the element type is not byte sized, the
+/// narrowed load is not legal (when running after the legalizer), or the
+/// target does not report the narrowed access as both allowed and fast.
+///
+/// \param MI        The G_EXTRACT_VECTOR_ELT instruction.
+/// \param MatchInfo On success, set to a callback that builds the scalar load
+///                  into MI's result register and erases the vector load.
+/// \returns true if the combine applies and MatchInfo was populated.
+bool CombinerHelper::matchCombineExtractedVectorLoad(MachineInstr &MI,
+                                                     BuildFnTy &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
+
+  Register Result = MI.getOperand(0).getReg();
+  Register Vector = MI.getOperand(1).getReg();
+  Register Index = MI.getOperand(2).getReg();
+
+  // Check if there is a load that defines the vector being extracted from.
+  auto *LoadMI = getOpcodeDef<GLoad>(Vector, MRI);
+  if (!LoadMI)
+    return false;
+
+  LLT VecEltTy = MRI.getType(Vector).getElementType();
+
+  // Folded into the assert so no unused local is left behind in NDEBUG builds.
+  assert(MRI.getType(Result) == VecEltTy &&
+         "extract result type must match the vector element type");
+
+  // Checking whether we should reduce the load width: the vector load is
+  // erased by the combine, so the extract must be its only (non-debug) user.
+  if (!MRI.hasOneNonDBGUse(Vector))
+    return false;
+
+  // Check if the defining load is simple.
+  if (!LoadMI->isSimple())
+    return false;
+
+  // If the vector element type is not a multiple of a byte then we are unable
+  // to correctly compute an address to load only the extracted element as a
+  // scalar.
+  if (!VecEltTy.isByteSized())
+    return false;
+
+  // Check if the new load that we are going to create is legal
+  // if we are in the post-legalization phase.
+  MachineMemOperand MMO = LoadMI->getMMO();
+  Align Alignment = MMO.getAlign();
+  MachinePointerInfo PtrInfo;
+  uint64_t Offset;
+
+  // Finding the appropriate PtrInfo if offset is a known constant.
+  // This is required to create the memory operand for the narrowed load.
+  // This machine memory operand object helps us infer about legality
+  // before we proceed to combine the instruction.
+  //
+  // The constant has to be looked up on the *index* operand; the vector
+  // operand is defined by the load and is never a G_CONSTANT, so querying it
+  // would make this branch unreachable.
+  if (auto CVal = getIConstantVRegVal(Index, MRI)) {
+    int Elt = CVal->getZExtValue();
+    // FIXME: should be (ABI size)*Elt.
+    Offset = VecEltTy.getSizeInBits() * Elt / 8;
+    PtrInfo = MMO.getPointerInfo().getWithOffset(Offset);
+  } else {
+    // Discard the pointer info except the address space because the memory
+    // operand can't represent this new access since the offset is variable.
+    // Using a single element's size as the offset gives the alignment that
+    // holds for every possible element position.
+    Offset = VecEltTy.getSizeInBits() / 8;
+    PtrInfo = MachinePointerInfo(MMO.getPointerInfo().getAddrSpace());
+  }
+
+  Alignment = commonAlignment(Alignment, Offset);
+
+  Register VecPtr = LoadMI->getPointerReg();
+  LLT PtrTy = MRI.getType(VecPtr);
+
+  MachineFunction &MF = *MI.getMF();
+  // NOTE(review): NewMMO is derived from the vector load's MMO and is not
+  // given the narrowed Alignment computed above; confirm the legality and
+  // fast-access queries below see the intended alignment.
+  auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, VecEltTy);
+
+  LegalityQuery::MemDesc MMDesc(*NewMMO);
+
+  LegalityQuery Q = {TargetOpcode::G_LOAD, {VecEltTy, PtrTy}, {MMDesc}};
+
+  if (!isLegalOrBeforeLegalizer(Q))
+    return false;
+
+  // Load must be allowed and fast on the target.
+  LLVMContext &C = MF.getFunction().getContext();
+  auto &DL = MF.getDataLayout();
+  unsigned Fast = 0;
+  if (!getTargetLowering().allowsMemoryAccess(C, DL, VecEltTy, *NewMMO,
+                                              &Fast) ||
+      !Fast)
+    return false;
+
+  // NOTE(review): the scalar load is emitted through the builder handed to
+  // this callback rather than at LoadMI's position; confirm that no memory
+  // write between the original load and the extract can be crossed.
+  MatchInfo = [=](MachineIRBuilder &B) {
+    GISelObserverWrapper DummyObserver;
+    LegalizerHelper Helper(B.getMF(), DummyObserver, B);
+    // Get pointer to the vector element.
+    Register finalPtr = Helper.getVectorElementPointer(
+        LoadMI->getPointerReg(), MRI.getType(LoadMI->getOperand(0).getReg()),
+        Index);
+    // New G_LOAD instruction.
+    B.buildLoad(Result, finalPtr, PtrInfo, Alignment);
+    // Remove original GLOAD instruction.
+    LoadMI->eraseFromParent();
+  };
+
+  return true;
+}
+
bool CombinerHelper::matchCombineIndexedLoadStore(
MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
auto &LdSt = cast<GLoadStore>(MI);
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 0d7620d1c883d..7493afd672d43 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -14659,17 +14659,9 @@ define i8 @load_single_extract_variable_index_i8(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_i8:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: mov w9, w1
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and x9, x9, #0xf
-; CHECK-GISEL-NEXT: lsl x10, x9, #1
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: sub x9, x10, x9
-; CHECK-GISEL-NEXT: ldrb w0, [x8, x9]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: mov w8, w1
+; CHECK-GISEL-NEXT: and x8, x8, #0xf
+; CHECK-GISEL-NEXT: ldrb w0, [x0, x8]
; CHECK-GISEL-NEXT: ret
%lv = load <16 x i8>, ptr %A
%e = extractelement <16 x i8> %lv, i32 %idx
@@ -14692,15 +14684,9 @@ define i16 @load_single_extract_variable_index_i16(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_i16:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov w9, w1
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and x9, x9, #0x7
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: ldrh w0, [x8, x9, lsl #1]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: mov w8, w1
+; CHECK-GISEL-NEXT: and x8, x8, #0x7
+; CHECK-GISEL-NEXT: ldrh w0, [x0, x8, lsl #1]
; CHECK-GISEL-NEXT: ret
%lv = load <8 x i16>, ptr %A
%e = extractelement <8 x i16> %lv, i32 %idx
@@ -14717,15 +14703,9 @@ define i32 @load_single_extract_variable_index_i32(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_i32:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov w9, w1
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and x9, x9, #0x3
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: ldr w0, [x8, x9, lsl #2]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: mov w8, w1
+; CHECK-GISEL-NEXT: and x8, x8, #0x3
+; CHECK-GISEL-NEXT: ldr w0, [x0, x8, lsl #2]
; CHECK-GISEL-NEXT: ret
%lv = load <4 x i32>, ptr %A
%e = extractelement <4 x i32> %lv, i32 %idx
@@ -14779,14 +14759,8 @@ define i32 @load_single_extract_variable_index_masked_i32(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_masked_i32:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and w9, w1, #0x3
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: ldr w0, [x8, w9, uxtw #2]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: and w8, w1, #0x3
+; CHECK-GISEL-NEXT: ldr w0, [x0, w8, uxtw #2]
; CHECK-GISEL-NEXT: ret
%idx.x = and i32 %idx, 3
%lv = load <4 x i32>, ptr %A
@@ -14803,14 +14777,8 @@ define i32 @load_single_extract_variable_index_masked2_i32(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_masked2_i32:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and w9, w1, #0x1
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: ldr w0, [x8, w9, uxtw #2]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: and w8, w1, #0x1
+; CHECK-GISEL-NEXT: ldr w0, [x0, w8, uxtw #2]
; CHECK-GISEL-NEXT: ret
%idx.x = and i32 %idx, 1
%lv = load <4 x i32>, ptr %A
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
index 69346de9bb798..80dc3dead35ab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -124,71 +124,71 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
; GFX9-LABEL: test_add_mul_multiple_defs_z:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_add_mul_multiple_defs_z:
; GFX9-CONTRACT: ; %bb.0: ; %.entry
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v3
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-DENORM-LABEL: test_add_mul_multiple_defs_z:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
-; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
; GFX9-UNSAFE: ; %bb.0: ; %.entry
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v3
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_add_mul_multiple_defs_z:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_add_mul_multiple_defs_z:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v3, v0, v1
-; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2
; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DENORM-LABEL: test_add_mul_multiple_defs_z:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
-; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v3, v0, v1
-; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v2
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
%a = fmul float %x, %y
@@ -202,71 +202,71 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX9-CONTRACT: ; %bb.0: ; %.entry
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v3
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
-; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX9-UNSAFE: ; %bb.0: ; %.entry
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v3
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v2, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v3, v0, v1
-; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2
; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
-; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v3, v0, v1
-; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v2
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
%a = fmul float %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 33a4d3c5494f7..a13c60b4e8414 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -8,170 +8,12 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GCN-LABEL: v_extract_v64i32_varidx:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, s33
-; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0
-; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v6, v2
-; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16
-; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32
-; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48
-; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64
-; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80
-; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96
-; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112
-; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128
-; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144
-; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160
-; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176
-; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192
-; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208
-; GCN-NEXT: s_add_i32 s32, s32, 0x10000
-; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224
-; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240
-; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33
-; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:268
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:272
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:288
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300
-; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308
-; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312
-; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316
-; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320
-; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328
-; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:348
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:352
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380
-; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384
-; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388
-; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392
-; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404
-; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408
-; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412
-; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416
-; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420
-; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424
-; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428
-; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432
-; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436
-; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440
-; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444
-; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448
-; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452
-; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456
-; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT: v_and_b32_e32 v0, 63, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: v_add_u32_e32 v0, v1, v0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v16, v20
-; GCN-NEXT: v_mov_b32_e32 v17, v21
-; GCN-NEXT: v_mov_b32_e32 v18, v22
-; GCN-NEXT: v_mov_b32_e32 v19, v23
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:464
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:468
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:472
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:476
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508
-; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: v_and_b32_e32 v2, 63, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v2
+; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT: global_load_dword v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
@@ -183,174 +25,12 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GCN-LABEL: v_extract_v128i16_varidx:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, s33
-; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0
-; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v6, v2
-; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16
-; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32
-; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48
-; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64
-; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80
-; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96
-; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112
-; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128
-; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144
-; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160
-; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176
-; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192
-; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208
-; GCN-NEXT: s_add_i32 s32, s32, 0x10000
-; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224
-; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240
-; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33
-; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:268
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:272
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:288
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300
-; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308
-; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312
-; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316
-; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320
-; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328
-; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:348
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:352
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380
-; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384
-; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388
-; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392
-; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404
-; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408
-; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412
-; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416
-; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420
-; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424
-; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428
-; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432
-; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436
-; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440
-; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444
-; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448
-; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452
-; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456
-; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT: v_bfe_u32 v0, v6, 1, 6
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: v_add_u32_e32 v0, v1, v0
-; GCN-NEXT: v_and_b32_e32 v1, 1, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v16, v20
-; GCN-NEXT: v_mov_b32_e32 v17, v21
-; GCN-NEXT: v_mov_b32_e32 v18, v22
-; GCN-NEXT: v_mov_b32_e32 v19, v23
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:464
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:468
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:472
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:476
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508
-; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b32 s33, s4
-; GCN-NEXT: s_waitcnt vmcnt(16)
-; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; GCN-NEXT: v_and_b32_e32 v2, 0x7f, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT: global_load_ushort v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%vec = load <128 x i16>, ptr addrspace(1) %ptr
@@ -362,171 +42,12 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GCN-LABEL: v_extract_v32i64_varidx:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, s33
-; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0
-; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v6, v2
-; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16
-; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32
-; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48
-; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64
-; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80
-; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96
-; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112
-; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128
-; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144
-; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160
-; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176
-; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192
-; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208
-; GCN-NEXT: s_add_i32 s32, s32, 0x10000
-; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224
-; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:268
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:272
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:288
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300
-; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308
-; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312
-; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316
-; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320
-; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328
-; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:348
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:352
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380
-; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384
-; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388
-; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392
-; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404
-; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408
-; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412
-; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416
-; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420
-; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424
-; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428
-; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432
-; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436
-; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440
-; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444
-; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448
-; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452
-; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456
-; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT: v_and_b32_e32 v0, 31, v6
-; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT: v_add_u32_e32 v2, 0x100, v2
-; GCN-NEXT: v_add_u32_e32 v1, v2, v0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v16, v20
-; GCN-NEXT: v_mov_b32_e32 v17, v21
-; GCN-NEXT: v_mov_b32_e32 v18, v22
-; GCN-NEXT: v_mov_b32_e32 v19, v23
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:464
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:468
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:472
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:476
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508
-; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
-; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: v_and_b32_e32 v2, 31, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%vec = load <32 x i64>, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index 786d65f7dcc40..057790617204c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -8,29 +8,36 @@
define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
; GCN-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0
-; GCN-NEXT: s_lshl_b32 m0, s4, 1
+; GCN-NEXT: s_and_b32 s0, s4, 3
+; GCN-NEXT: s_lshl_b32 s0, s0, 4
+; GCN-NEXT: s_ashr_i32 s1, s0, 31
+; GCN-NEXT: s_add_u32 s0, s2, s0
+; GCN-NEXT: s_addc_u32 s1, s3, s1
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_movrels_b64 s[0:1], s[8:9]
-; GCN-NEXT: s_movrels_b64 s[2:3], s[10:11]
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0
-; GFX10-NEXT: s_lshl_b32 m0, s4, 1
+; GFX10-NEXT: s_and_b32 s0, s4, 3
+; GFX10-NEXT: s_lshl_b32 s0, s0, 4
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_add_u32 s0, s2, s0
+; GFX10-NEXT: s_addc_u32 s1, s3, s1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_movrels_b64 s[0:1], s[8:9]
-; GFX10-NEXT: s_movrels_b64 s[2:3], s[10:11]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b512 s[8:23], s[2:3], 0x0
-; GFX11-NEXT: s_lshl_b32 m0, s4, 1
+; GFX11-NEXT: s_and_b32 s0, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s0, s0, 4
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_movrels_b64 s[0:1], s[8:9]
-; GFX11-NEXT: s_movrels_b64 s[2:3], s[10:11]
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 %idx
@@ -40,46 +47,32 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inre
define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
-; GFX9-NEXT: s_lshl_b32 s0, s2, 1
-; GFX9-NEXT: s_lshl_b32 s2, s0, 1
+; GFX9-NEXT: s_and_b32 s0, s2, 3
+; GFX9-NEXT: s_lshl_b32 s0, s0, 4
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-NEXT: v_mov_b32_e32 v18, v2
-; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v3
-; GFX9-NEXT: s_set_gpr_idx_off
-; GFX9-NEXT: v_readfirstlane_b32 s2, v18
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NEXT: v_readfirstlane_b32 s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v0
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
-; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[6:7]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[10:11]
-; GFX8-NEXT: flat_load_dwordx4 v[14:17], v[0:1]
-; GFX8-NEXT: s_lshl_b32 s0, s2, 1
-; GFX8-NEXT: s_lshl_b32 m0, s0, 1
+; GFX8-NEXT: s_and_b32 s0, s2, 3
+; GFX8-NEXT: s_lshl_b32 s0, s0, 4
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_movrels_b32_e32 v1, v3
-; GFX8-NEXT: v_movrels_b32_e32 v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_readfirstlane_b32 s2, v2
@@ -88,20 +81,13 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr
;
; GFX7-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: buffer_load_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT: buffer_load_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT: s_lshl_b32 s0, s2, 1
-; GFX7-NEXT: s_lshl_b32 m0, s0, 1
+; GFX7-NEXT: s_and_b32 s0, s2, 3
+; GFX7-NEXT: s_lshl_b32 s0, s0, 4
+; GFX7-NEXT: s_ashr_i32 s1, s0, 31
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_movrels_b32_e32 v1, v3
-; GFX7-NEXT: v_movrels_b32_e32 v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s1, v1
; GFX7-NEXT: v_readfirstlane_b32 s2, v2
@@ -110,44 +96,38 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr
;
; GFX10-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x3
-; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
-; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
-; GFX10-NEXT: s_lshl_b32 s0, s2, 1
-; GFX10-NEXT: s_lshl_b32 m0, s0, 1
+; GFX10-NEXT: s_and_b32 s0, s2, 3
+; GFX10-NEXT: s_lshl_b32 s0, s0, 4
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_movrels_b32_e32 v1, v3
-; GFX10-NEXT: v_movrels_b32_e32 v0, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off
-; GFX11-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16
-; GFX11-NEXT: global_load_b128 v[10:13], v[0:1], off offset:32
-; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off offset:48
-; GFX11-NEXT: s_lshl_b32 s0, s2, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: s_lshl_b32 m0, s0, 1
+; GFX11-NEXT: s_and_b32 s0, s2, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s0, s0, 4
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_movrels_b32_e32 v0, v2
-; GFX11-NEXT: v_movrels_b32_e32 v1, v3
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(1) %ptr
%element = extractelement <4 x i128> %vector, i32 %idx
@@ -158,298 +138,66 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2
-; GFX9-NEXT: v_add_u32_e32 v16, 1, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v14, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v15, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0
-; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8]
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 1, v2
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v16
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v11, v12, v8, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16
+; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v15, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s10, 0
-; GFX7-NEXT: s_mov_b32 s11, 0xf000
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:16
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2
-; GFX7-NEXT: v_add_i32_e32 v16, vcc, 1, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32
-; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:48
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v14, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v15, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2
-; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
-; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v2
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v3
-; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v12, v14, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v12, v14, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v13, v15, s4
-; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v3
-; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v5, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v19, v5, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v6, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v7, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v3
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s4
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v14, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v15, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_load_b128 v[16:19], v[0:1], off
-; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
-; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:32
-; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:48
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v19, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v18 :: v_dual_add_nc_u32 v1, 1, v0
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
-; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v18, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v19, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_cndmask_b32 v3, v3, v7
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v1
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v8 :: v_dual_cndmask_b32 v3, v3, v9
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v6, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v7, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v12 :: v_dual_cndmask_b32 v3, v3, v13
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v14, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v15, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v14, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v15, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i128>, ptr addrspace(1) %ptr
%element = extractelement <4 x i128> %vector, i32 %idx
@@ -459,68 +207,15 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
; GFX9-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_add_u32_e32 v19, 1, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v1, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, s6
-; GFX9-NEXT: v_mov_b32_e32 v8, s7
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v6, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX9-NEXT: v_mov_b32_e32 v9, s8
-; GFX9-NEXT: v_mov_b32_e32 v10, s9
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
-; GFX9-NEXT: v_mov_b32_e32 v11, s10
-; GFX9-NEXT: v_mov_b32_e32 v12, s11
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v10, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
-; GFX9-NEXT: v_mov_b32_e32 v13, s12
-; GFX9-NEXT: v_mov_b32_e32 v14, s13
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v12, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v14, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
-; GFX9-NEXT: v_mov_b32_e32 v15, s14
-; GFX9-NEXT: v_mov_b32_e32 v16, s15
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v14, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v15, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_readfirstlane_b32 s2, v2
@@ -529,68 +224,15 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
;
; GFX8-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_mov_b32_e32 v5, s4
-; GFX8-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v1, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v7, s6
-; GFX8-NEXT: v_mov_b32_e32 v8, s7
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v6, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX8-NEXT: v_mov_b32_e32 v9, s8
-; GFX8-NEXT: v_mov_b32_e32 v10, s9
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v8, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
-; GFX8-NEXT: v_mov_b32_e32 v11, s10
-; GFX8-NEXT: v_mov_b32_e32 v12, s11
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v10, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
-; GFX8-NEXT: v_mov_b32_e32 v13, s12
-; GFX8-NEXT: v_mov_b32_e32 v14, s13
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v12, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v13, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v14, vcc
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, 1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
-; GFX8-NEXT: v_mov_b32_e32 v15, s14
-; GFX8-NEXT: v_mov_b32_e32 v16, s15
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v14, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v15, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_readfirstlane_b32 s2, v2
@@ -599,68 +241,15 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
;
; GFX7-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: v_mov_b32_e32 v3, s2
-; GFX7-NEXT: v_mov_b32_e32 v4, s3
-; GFX7-NEXT: v_mov_b32_e32 v5, s4
-; GFX7-NEXT: v_mov_b32_e32 v6, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v1, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX7-NEXT: v_mov_b32_e32 v7, s6
-; GFX7-NEXT: v_mov_b32_e32 v8, s7
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v5, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v6, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX7-NEXT: v_mov_b32_e32 v9, s8
-; GFX7-NEXT: v_mov_b32_e32 v10, s9
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v8, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
-; GFX7-NEXT: v_mov_b32_e32 v11, s10
-; GFX7-NEXT: v_mov_b32_e32 v12, s11
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v9, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v10, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
-; GFX7-NEXT: v_mov_b32_e32 v13, s12
-; GFX7-NEXT: v_mov_b32_e32 v14, s13
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v12, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v13, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v14, vcc
-; GFX7-NEXT: v_add_i32_e32 v19, vcc, 1, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
-; GFX7-NEXT: v_mov_b32_e32 v15, s14
-; GFX7-NEXT: v_mov_b32_e32 v16, s15
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1]
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v14, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v15, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s1, v1
; GFX7-NEXT: v_readfirstlane_b32 s2, v2
@@ -669,54 +258,15 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
;
; GFX10-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-NEXT: v_mov_b32_e32 v3, s7
-; GFX10-NEXT: v_cndmask_b32_e32 v4, s4, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, s5, v3, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, s4, v2, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, s5, v3, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s9, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s9, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s11, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s11, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 4, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 6, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 7, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0
+; GFX10-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
@@ -725,63 +275,18 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
;
; GFX11-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_lshlrev_b32 v0, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_add_nc_u32 v1, 1, v0
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, s4, v2, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v5, s5, v3, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, s4, v2, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, s5, v3, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s9, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s8, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s9, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s11, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s11, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
@@ -793,19 +298,19 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %ptr) {
; GCN-LABEL: extractelement_sgpr_v4i128_idx0:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0
+; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
@@ -814,34 +319,34 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %p
}
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i128_idx1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mov_b32 s2, s6
-; GCN-NEXT: s_mov_b32 s3, s7
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i128_idx1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_idx1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_idx1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s4
-; GFX10-NEXT: s_mov_b32 s1, s5
-; GFX10-NEXT: s_mov_b32 s2, s6
-; GFX10-NEXT: s_mov_b32 s3, s7
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0
+; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s4
-; GFX11-NEXT: s_mov_b32 s1, s5
-; GFX11-NEXT: s_mov_b32 s2, s6
-; GFX11-NEXT: s_mov_b32 s3, s7
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 1
@@ -849,34 +354,34 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %p
}
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i128_idx2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s8
-; GCN-NEXT: s_mov_b32 s1, s9
-; GCN-NEXT: s_mov_b32 s2, s10
-; GCN-NEXT: s_mov_b32 s3, s11
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i128_idx2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_idx2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_idx2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s8
-; GFX10-NEXT: s_mov_b32 s1, s9
-; GFX10-NEXT: s_mov_b32 s2, s10
-; GFX10-NEXT: s_mov_b32 s3, s11
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0
+; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x20
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s8
-; GFX11-NEXT: s_mov_b32 s1, s9
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 2
@@ -884,34 +389,34 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %p
}
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i128_idx3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s12
-; GCN-NEXT: s_mov_b32 s1, s13
-; GCN-NEXT: s_mov_b32 s2, s14
-; GCN-NEXT: s_mov_b32 s3, s15
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i128_idx3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_idx3:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_idx3:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0xc
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s12
-; GFX10-NEXT: s_mov_b32 s1, s13
-; GFX10-NEXT: s_mov_b32 s2, s14
-; GFX10-NEXT: s_mov_b32 s3, s15
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0
+; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x30
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s12
-; GFX11-NEXT: s_mov_b32 s1, s13
-; GFX11-NEXT: s_mov_b32 s2, s14
-; GFX11-NEXT: s_mov_b32 s3, s15
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
index 7028d1157787f..6d772df3fa281 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
@@ -6,42 +6,74 @@
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_lshr_b32 s2, s4, 1
-; GCN-NEXT: s_cmp_eq_u32 s2, 1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cselect_b32 s0, s1, s0
-; GCN-NEXT: s_and_b32 s1, s4, 1
-; GCN-NEXT: s_lshl_b32 s1, s1, 4
-; GCN-NEXT: s_lshr_b32 s0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s4, 3
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: s_add_u32 s0, s2, s0
+; GFX9-NEXT: s_addc_u32 s1, s3, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s4, 3
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: s_add_u32 s0, s2, s0
+; GFX8-NEXT: s_addc_u32 s1, s3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_and_b32 s2, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, s2, 1
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_lshr_b32 s2, s4, 1
-; GFX10-NEXT: s_cmp_eq_u32 s2, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cselect_b32 s0, s1, s0
-; GFX10-NEXT: s_and_b32 s1, s4, 1
-; GFX10-NEXT: s_lshl_b32 s1, s1, 4
-; GFX10-NEXT: s_lshr_b32 s0, s0, s1
+; GFX10-NEXT: s_and_b32 s0, s4, 3
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_add_u32 s0, s2, s0
+; GFX10-NEXT: s_addc_u32 s1, s3, s1
+; GFX10-NEXT: global_load_ushort v0, v0, s[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_lshr_b32 s2, s4, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_eq_u32 s2, 1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cselect_b32 s0, s1, s0
-; GFX11-NEXT: s_and_b32 s1, s4, 1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 4
+; GFX11-NEXT: s_and_b32 s0, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshr_b32 s0, s0, s1
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(4) %ptr
%element = extractelement <4 x i16> %vector, i32 %idx
@@ -51,71 +83,71 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg
define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_lshr_b32 s0, s2, 1
-; GFX9-NEXT: s_and_b32 s1, s2, 1
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX9-NEXT: s_lshl_b32 s0, s1, 4
+; GFX9-NEXT: s_and_b32 s0, s2, 3
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_lshr_b32 s0, s2, 1
-; GFX8-NEXT: s_and_b32 s1, s2, 1
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_lshl_b32 s0, s1, 4
+; GFX8-NEXT: s_and_b32 s0, s2, 3
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_lshr_b32 s0, s2, 1
-; GFX7-NEXT: s_and_b32 s1, s2, 1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_lshl_b32 s0, s1, 4
+; GFX7-NEXT: s_and_b32 s0, s2, 3
+; GFX7-NEXT: s_lshl_b32 s0, s0, 1
+; GFX7-NEXT: s_ashr_i32 s1, s0, 31
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: s_lshr_b32 s0, s2, 1
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX10-NEXT: s_and_b32 s0, s2, 1
-; GFX10-NEXT: s_lshl_b32 s0, s0, 4
+; GFX10-NEXT: s_and_b32 s0, s2, 3
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: s_lshr_b32 s0, s2, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX11-NEXT: s_and_b32 s0, s2, 1
-; GFX11-NEXT: s_lshl_b32 s0, s0, 4
+; GFX11-NEXT: s_and_b32 s0, s2, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(1) %ptr
@@ -127,70 +159,66 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
; GFX9-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i16>, ptr addrspace(1) %ptr
%element = extractelement <4 x i16> %vector, i32 %idx
@@ -198,48 +226,74 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
}
define amdgpu_ps i16 @extractelement_sgpr_v4i16_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
-; GCN-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-NEXT: v_dual_cndmask_b32 v1, s0, v2 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(4) %ptr
@@ -248,22 +302,48 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_vgpr_idx(ptr addrspace(4) inreg
}
define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx0(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i16_idx0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_idx0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_idx0:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_idx0:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i16_idx0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i16_idx0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(4) %ptr
%element = extractelement <4 x i16> %vector, i32 0
@@ -271,25 +351,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx0(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i16_idx1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_idx1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_idx1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 2
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_idx1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i16_idx1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i16_idx1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(4) %ptr
%element = extractelement <4 x i16> %vector, i32 1
@@ -297,25 +402,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx1(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i16_idx2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_idx2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_idx2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 4
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_idx2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i16_idx2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i16_idx2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(4) %ptr
%element = extractelement <4 x i16> %vector, i32 2
@@ -323,25 +453,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx2(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i16_idx3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s1, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_idx3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_idx3:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 6
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_idx3:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i16_idx3:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s1, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i16_idx3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s1, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:6
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(4) %ptr
%element = extractelement <4 x i16> %vector, i32 3
@@ -352,14 +507,14 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i16_idx0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i16_idx0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -369,21 +524,21 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i16_idx0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i16_idx0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i16>, ptr addrspace(1) %ptr
@@ -395,17 +550,17 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i16_idx1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i16_idx1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i16_idx1:
@@ -414,25 +569,22 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i16_idx1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i16_idx1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i16>, ptr addrspace(1) %ptr
%element = extractelement <4 x i16> %vector, i32 1
@@ -443,17 +595,17 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i16_idx2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i16_idx2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i16_idx2:
@@ -462,25 +614,22 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i16_idx2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i16_idx2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i16>, ptr addrspace(1) %ptr
%element = extractelement <4 x i16> %vector, i32 2
@@ -491,17 +640,17 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i16_idx3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:6
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i16_idx3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i16_idx3:
@@ -510,25 +659,22 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i16_idx3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i16_idx3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:6
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i16>, ptr addrspace(1) %ptr
%element = extractelement <4 x i16> %vector, i32 3
@@ -536,54 +682,74 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_lshr_b32 s5, s4, 1
-; GCN-NEXT: s_cmp_eq_u32 s5, 1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cselect_b32 s0, s1, s0
-; GCN-NEXT: s_cmp_eq_u32 s5, 2
-; GCN-NEXT: s_cselect_b32 s0, s2, s0
-; GCN-NEXT: s_cmp_eq_u32 s5, 3
-; GCN-NEXT: s_cselect_b32 s0, s3, s0
-; GCN-NEXT: s_and_b32 s1, s4, 1
-; GCN-NEXT: s_lshl_b32 s1, s1, 4
-; GCN-NEXT: s_lshr_b32 s0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s4, 7
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: s_add_u32 s0, s2, s0
+; GFX9-NEXT: s_addc_u32 s1, s3, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s4, 7
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: s_add_u32 s0, s2, s0
+; GFX8-NEXT: s_addc_u32 s1, s3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_and_b32 s2, s4, 7
+; GFX7-NEXT: s_lshl_b32 s4, s2, 1
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_lshr_b32 s5, s4, 1
-; GFX10-NEXT: s_cmp_eq_u32 s5, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cselect_b32 s0, s1, s0
-; GFX10-NEXT: s_cmp_eq_u32 s5, 2
-; GFX10-NEXT: s_cselect_b32 s0, s2, s0
-; GFX10-NEXT: s_cmp_eq_u32 s5, 3
-; GFX10-NEXT: s_cselect_b32 s0, s3, s0
-; GFX10-NEXT: s_and_b32 s1, s4, 1
-; GFX10-NEXT: s_lshl_b32 s1, s1, 4
-; GFX10-NEXT: s_lshr_b32 s0, s0, s1
+; GFX10-NEXT: s_and_b32 s0, s4, 7
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_add_u32 s0, s2, s0
+; GFX10-NEXT: s_addc_u32 s1, s3, s1
+; GFX10-NEXT: global_load_ushort v0, v0, s[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_lshr_b32 s5, s4, 1
+; GFX11-NEXT: s_and_b32 s0, s4, 7
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_eq_u32 s5, 1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cselect_b32 s0, s1, s0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 2
-; GFX11-NEXT: s_cselect_b32 s0, s2, s0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 3
-; GFX11-NEXT: s_cselect_b32 s0, s3, s0
-; GFX11-NEXT: s_and_b32 s1, s4, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s1, s1, 4
-; GFX11-NEXT: s_lshr_b32 s0, s0, s1
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 %idx
@@ -593,92 +759,71 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(ptr addrspace(4) inreg
define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: s_lshr_b32 s0, s2, 1
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX9-NEXT: s_and_b32 s1, s2, 1
+; GFX9-NEXT: s_and_b32 s0, s2, 7
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT: s_lshl_b32 s0, s1, 4
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: s_lshr_b32 s0, s2, 1
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_and_b32 s1, s2, 1
+; GFX8-NEXT: s_and_b32 s0, s2, 7
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT: s_lshl_b32 s0, s1, 4
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_lshr_b32 s0, s2, 1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_and_b32 s1, s2, 1
+; GFX7-NEXT: s_and_b32 s0, s2, 7
+; GFX7-NEXT: s_lshl_b32 s0, s0, 1
+; GFX7-NEXT: s_ashr_i32 s1, s0, 31
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX7-NEXT: s_lshl_b32 s0, s1, 4
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: s_lshr_b32 s0, s2, 1
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
+; GFX10-NEXT: s_and_b32 s0, s2, 7
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX10-NEXT: s_and_b32 s0, s2, 1
-; GFX10-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: s_lshr_b32 s0, s2, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
+; GFX11-NEXT: s_and_b32 s0, s2, 7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX11-NEXT: s_and_b32 s0, s2, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: s_lshl_b32 s0, s0, 4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(1) %ptr
@@ -690,91 +835,66 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
; GFX9-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 %idx
@@ -782,64 +902,74 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
-; GCN-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1
-; GCN-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_and_b32_e32 v0, 7, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_and_b32_e32 v0, 7, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_and_b32_e32 v0, 7, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 7, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: v_and_b32_e32 v0, 7, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
@@ -848,22 +978,48 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_vgpr_idx(ptr addrspace(4) inreg
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx0(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx0:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx0:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 0
@@ -871,25 +1027,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx0(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 2
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 1
@@ -897,25 +1078,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx1(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 4
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 2
@@ -923,25 +1129,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx2(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s1, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx3:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 6
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx3:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx3:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s1, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s1, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:6
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 3
@@ -949,25 +1180,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx3(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx4(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s2
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:8
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx4:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 8
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx4:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:8
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s2
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:8
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx4:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s2
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:8
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 4
@@ -975,25 +1231,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx4(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx5(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx5:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s2, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx5:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:10
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx5:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 10
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx5:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx5:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s2, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:10
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx5:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s2, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:10
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 5
@@ -1001,25 +1282,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx5(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx6(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx6:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx6:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:12
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx6:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 12
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx6:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:12
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx6:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:12
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx6:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:12
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 6
@@ -1027,25 +1333,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx6(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx7(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx7:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s3, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx7:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:14
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx7:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 14
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx7:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx7:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s3, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:14
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx7:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s3, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:14
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 7
@@ -1056,14 +1387,14 @@ define i16 @extractelement_vgpr_v8i16_idx0(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1073,21 +1404,21 @@ define i16 @extractelement_vgpr_v8i16_idx0(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
@@ -1099,17 +1430,17 @@ define i16 @extractelement_vgpr_v8i16_idx1(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx1:
@@ -1118,25 +1449,22 @@ define i16 @extractelement_vgpr_v8i16_idx1(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 1
@@ -1147,17 +1475,17 @@ define i16 @extractelement_vgpr_v8i16_idx2(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx2:
@@ -1166,25 +1494,22 @@ define i16 @extractelement_vgpr_v8i16_idx2(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 2
@@ -1195,17 +1520,17 @@ define i16 @extractelement_vgpr_v8i16_idx3(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:6
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx3:
@@ -1214,25 +1539,22 @@ define i16 @extractelement_vgpr_v8i16_idx3(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:6
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 3
@@ -1243,17 +1565,17 @@ define i16 @extractelement_vgpr_v8i16_idx4(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx4:
@@ -1262,25 +1584,22 @@ define i16 @extractelement_vgpr_v8i16_idx4(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:8
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:8
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 4
@@ -1291,17 +1610,17 @@ define i16 @extractelement_vgpr_v8i16_idx5(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx5:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:10
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx5:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx5:
@@ -1310,25 +1629,22 @@ define i16 @extractelement_vgpr_v8i16_idx5(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx5:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:10
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx5:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:10
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 5
@@ -1339,17 +1655,17 @@ define i16 @extractelement_vgpr_v8i16_idx6(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx6:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:12
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx6:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx6:
@@ -1358,25 +1674,22 @@ define i16 @extractelement_vgpr_v8i16_idx6(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:12
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx6:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:12
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx6:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:12
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 6
@@ -1387,17 +1700,17 @@ define i16 @extractelement_vgpr_v8i16_idx7(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx7:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:14
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx7:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx7:
@@ -1406,27 +1719,26 @@ define i16 @extractelement_vgpr_v8i16_idx7(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx7:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:14
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx7:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:14
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 7
ret i16 %element
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
index c7f49d526fac0..c2394ec461490 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -6,32 +6,68 @@
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT: s_and_b32 s1, s4, 3
-; GCN-NEXT: s_lshl_b32 s1, s1, 3
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s4, 3
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: s_add_u32 s0, s2, s0
+; GFX9-NEXT: s_addc_u32 s1, s3, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s4, 3
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: s_add_u32 s0, s2, s0
+; GFX8-NEXT: s_addc_u32 s1, s3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_and_b32 s4, s4, 3
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT: s_and_b32 s1, s4, 3
-; GFX10-NEXT: s_lshl_b32 s1, s1, 3
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, s1
+; GFX10-NEXT: s_and_b32 s0, s4, 3
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_add_u32 s0, s2, s0
+; GFX10-NEXT: s_addc_u32 s1, s3, s1
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT: s_and_b32 s1, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s1, s1, 3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s0, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i8>, ptr addrspace(4) %ptr
%element = extractelement <4 x i8> %vector, i32 %idx
@@ -41,55 +77,65 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) inreg %p
define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_and_b32 s0, s2, 3
-; GFX9-NEXT: s_lshl_b32 s0, s0, 3
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_and_b32 s0, s2, 3
-; GFX8-NEXT: s_lshl_b32 s0, s0, 3
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_and_b32 s0, s2, 3
-; GFX7-NEXT: s_lshl_b32 s0, s0, 3
+; GFX7-NEXT: s_ashr_i32 s1, s0, 31
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_and_b32 s0, s2, 3
-; GFX10-NEXT: s_lshl_b32 s0, s0, 3
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_and_b32 s0, s2, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_lshl_b32 s0, s0, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i8>, ptr addrspace(1) %ptr
@@ -101,55 +147,60 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX9-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX9-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i8>, ptr addrspace(1) %ptr
%element = extractelement <4 x i8> %vector, i32 %idx
@@ -159,53 +210,67 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
; GFX9-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s0
+; GFX9-NEXT: v_and_b32_e32 v2, 3, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX8-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s0
+; GFX8-NEXT: v_and_b32_e32 v2, 3, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshr_b32_e32 v0, s0, v0
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s0
+; GFX10-NEXT: v_and_b32_e32 v2, 3, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 3, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i8>, ptr addrspace(4) %ptr
@@ -214,22 +279,48 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(ptr addrspace(4) inreg %p
}
define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx0(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i8_idx0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_idx0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_idx0:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_idx0:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i8_idx0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i8_idx0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i8>, ptr addrspace(4) %ptr
%element = extractelement <4 x i8> %vector, i32 0
@@ -237,25 +328,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx0(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i8_idx1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 8
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_idx1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_idx1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 1
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_idx1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i8_idx1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 8
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i8_idx1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 8
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i8>, ptr addrspace(4) %ptr
%element = extractelement <4 x i8> %vector, i32 1
@@ -263,25 +379,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i8_idx2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_idx2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_idx2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 2
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_idx2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i8_idx2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i8_idx2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i8>, ptr addrspace(4) %ptr
%element = extractelement <4 x i8> %vector, i32 2
@@ -289,25 +430,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i8_idx3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 24
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_idx3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_idx3:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 3
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_idx3:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i8_idx3:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i8_idx3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i8>, ptr addrspace(4) %ptr
%element = extractelement <4 x i8> %vector, i32 3
@@ -318,14 +484,14 @@ define i8 @extractelement_vgpr_v4i8_idx0(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i8_idx0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i8_idx0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -335,21 +501,21 @@ define i8 @extractelement_vgpr_v4i8_idx0(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i8_idx0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i8_idx0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i8>, ptr addrspace(1) %ptr
@@ -361,17 +527,17 @@ define i8 @extractelement_vgpr_v4i8_idx1(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i8_idx1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i8_idx1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i8_idx1:
@@ -380,25 +546,22 @@ define i8 @extractelement_vgpr_v4i8_idx1(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i8_idx1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i8_idx1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i8>, ptr addrspace(1) %ptr
%element = extractelement <4 x i8> %vector, i32 1
@@ -409,17 +572,17 @@ define i8 @extractelement_vgpr_v4i8_idx2(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i8_idx2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i8_idx2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i8_idx2:
@@ -428,25 +591,22 @@ define i8 @extractelement_vgpr_v4i8_idx2(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i8_idx2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i8_idx2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i8>, ptr addrspace(1) %ptr
%element = extractelement <4 x i8> %vector, i32 2
@@ -457,17 +617,17 @@ define i8 @extractelement_vgpr_v4i8_idx3(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i8_idx3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i8_idx3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i8_idx3:
@@ -476,25 +636,22 @@ define i8 @extractelement_vgpr_v4i8_idx3(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i8_idx3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:3
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i8_idx3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:3
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i8>, ptr addrspace(1) %ptr
%element = extractelement <4 x i8> %vector, i32 3
@@ -502,42 +659,68 @@ define i8 @extractelement_vgpr_v4i8_idx3(ptr addrspace(1) %ptr) {
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_lshr_b32 s2, s4, 2
-; GCN-NEXT: s_cmp_eq_u32 s2, 1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cselect_b32 s0, s1, s0
-; GCN-NEXT: s_and_b32 s1, s4, 3
-; GCN-NEXT: s_lshl_b32 s1, s1, 3
-; GCN-NEXT: s_lshr_b32 s0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s4, 7
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: s_add_u32 s0, s2, s0
+; GFX9-NEXT: s_addc_u32 s1, s3, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s4, 7
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: s_add_u32 s0, s2, s0
+; GFX8-NEXT: s_addc_u32 s1, s3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_and_b32 s4, s4, 7
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_lshr_b32 s2, s4, 2
-; GFX10-NEXT: s_cmp_eq_u32 s2, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cselect_b32 s0, s1, s0
-; GFX10-NEXT: s_and_b32 s1, s4, 3
-; GFX10-NEXT: s_lshl_b32 s1, s1, 3
-; GFX10-NEXT: s_lshr_b32 s0, s0, s1
+; GFX10-NEXT: s_and_b32 s0, s4, 7
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_add_u32 s0, s2, s0
+; GFX10-NEXT: s_addc_u32 s1, s3, s1
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_lshr_b32 s2, s4, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_eq_u32 s2, 1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cselect_b32 s0, s1, s0
-; GFX11-NEXT: s_and_b32 s1, s4, 3
-; GFX11-NEXT: s_lshl_b32 s1, s1, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshr_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s0, s4, 7
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 %idx
@@ -547,71 +730,65 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(ptr addrspace(4) inreg %p
define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_lshr_b32 s0, s2, 2
-; GFX9-NEXT: s_and_b32 s1, s2, 3
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX9-NEXT: s_lshl_b32 s0, s1, 3
+; GFX9-NEXT: s_and_b32 s0, s2, 7
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_lshr_b32 s0, s2, 2
-; GFX8-NEXT: s_and_b32 s1, s2, 3
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_lshl_b32 s0, s1, 3
+; GFX8-NEXT: s_and_b32 s0, s2, 7
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_lshr_b32 s0, s2, 2
-; GFX7-NEXT: s_and_b32 s1, s2, 3
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_lshl_b32 s0, s1, 3
+; GFX7-NEXT: s_and_b32 s0, s2, 7
+; GFX7-NEXT: s_ashr_i32 s1, s0, 31
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: s_lshr_b32 s0, s2, 2
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX10-NEXT: s_and_b32 s0, s2, 3
-; GFX10-NEXT: s_lshl_b32 s0, s0, 3
+; GFX10-NEXT: s_and_b32 s0, s2, 7
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: s_lshr_b32 s0, s2, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX11-NEXT: s_and_b32 s0, s2, 3
-; GFX11-NEXT: s_lshl_b32 s0, s0, 3
+; GFX11-NEXT: s_and_b32 s0, s2, 7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(1) %ptr
@@ -623,70 +800,60 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX9-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 2, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 %idx
@@ -694,48 +861,69 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
-; GCN-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_and_b32_e32 v0, 3, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_and_b32_e32 v0, 7, v0
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 7, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-NEXT: v_dual_cndmask_b32 v1, s0, v2 :: v_dual_and_b32 v0, 3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 7, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
@@ -744,22 +932,48 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(ptr addrspace(4) inreg %p
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx0:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx0:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 0
@@ -767,25 +981,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 8
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 1
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 8
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 8
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 1
@@ -793,25 +1032,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 2
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 2
@@ -819,25 +1083,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 24
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx3:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 3
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx3:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx3:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 3
@@ -845,25 +1134,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx4:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 4
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx4:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx4:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 4
@@ -871,25 +1185,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx5:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s1, 8
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx5:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:5
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx5:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 5
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx5:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx5:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s1, 8
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx5:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s1, 8
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:5
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 5
@@ -897,25 +1236,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx6:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s1, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx6:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx6:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 6
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx6:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx6:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s1, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx6:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s1, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:6
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 6
@@ -923,25 +1287,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx7(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx7:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s1, 24
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx7:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:7
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx7:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 7
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx7:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx7:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s1, 24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:7
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx7:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s1, 24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:7
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 7
@@ -952,14 +1341,14 @@ define i8 @extractelement_vgpr_v8i8_idx0(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -969,21 +1358,21 @@ define i8 @extractelement_vgpr_v8i8_idx0(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
@@ -995,17 +1384,17 @@ define i8 @extractelement_vgpr_v8i8_idx1(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx1:
@@ -1014,25 +1403,22 @@ define i8 @extractelement_vgpr_v8i8_idx1(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 1
@@ -1043,17 +1429,17 @@ define i8 @extractelement_vgpr_v8i8_idx2(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx2:
@@ -1062,25 +1448,22 @@ define i8 @extractelement_vgpr_v8i8_idx2(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 2
@@ -1091,17 +1474,17 @@ define i8 @extractelement_vgpr_v8i8_idx3(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx3:
@@ -1110,25 +1493,22 @@ define i8 @extractelement_vgpr_v8i8_idx3(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:3
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:3
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 3
@@ -1139,17 +1519,17 @@ define i8 @extractelement_vgpr_v8i8_idx4(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx4:
@@ -1158,25 +1538,22 @@ define i8 @extractelement_vgpr_v8i8_idx4(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 4
@@ -1187,17 +1564,17 @@ define i8 @extractelement_vgpr_v8i8_idx5(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx5:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx5:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 5, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx5:
@@ -1206,25 +1583,22 @@ define i8 @extractelement_vgpr_v8i8_idx5(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx5:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:5
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx5:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:5
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 5
@@ -1235,17 +1609,17 @@ define i8 @extractelement_vgpr_v8i8_idx6(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx6:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:6
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx6:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx6:
@@ -1254,25 +1628,22 @@ define i8 @extractelement_vgpr_v8i8_idx6(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx6:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx6:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:6
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 6
@@ -1283,17 +1654,17 @@ define i8 @extractelement_vgpr_v8i8_idx7(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx7:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx7:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 7, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx7:
@@ -1302,25 +1673,22 @@ define i8 @extractelement_vgpr_v8i8_idx7(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:7
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx7:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx7:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:7
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 7
@@ -1328,54 +1696,68 @@ define i8 @extractelement_vgpr_v8i8_idx7(ptr addrspace(1) %ptr) {
}
define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_lshr_b32 s5, s4, 2
-; GCN-NEXT: s_cmp_eq_u32 s5, 1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cselect_b32 s0, s1, s0
-; GCN-NEXT: s_cmp_eq_u32 s5, 2
-; GCN-NEXT: s_cselect_b32 s0, s2, s0
-; GCN-NEXT: s_cmp_eq_u32 s5, 3
-; GCN-NEXT: s_cselect_b32 s0, s3, s0
-; GCN-NEXT: s_and_b32 s1, s4, 3
-; GCN-NEXT: s_lshl_b32 s1, s1, 3
-; GCN-NEXT: s_lshr_b32 s0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s4, 15
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: s_add_u32 s0, s2, s0
+; GFX9-NEXT: s_addc_u32 s1, s3, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s4, 15
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: s_add_u32 s0, s2, s0
+; GFX8-NEXT: s_addc_u32 s1, s3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_and_b32 s4, s4, 15
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_lshr_b32 s5, s4, 2
-; GFX10-NEXT: s_cmp_eq_u32 s5, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cselect_b32 s0, s1, s0
-; GFX10-NEXT: s_cmp_eq_u32 s5, 2
-; GFX10-NEXT: s_cselect_b32 s0, s2, s0
-; GFX10-NEXT: s_cmp_eq_u32 s5, 3
-; GFX10-NEXT: s_cselect_b32 s0, s3, s0
-; GFX10-NEXT: s_and_b32 s1, s4, 3
-; GFX10-NEXT: s_lshl_b32 s1, s1, 3
-; GFX10-NEXT: s_lshr_b32 s0, s0, s1
+; GFX10-NEXT: s_and_b32 s0, s4, 15
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_add_u32 s0, s2, s0
+; GFX10-NEXT: s_addc_u32 s1, s3, s1
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_lshr_b32 s5, s4, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_eq_u32 s5, 1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cselect_b32 s0, s1, s0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 2
-; GFX11-NEXT: s_cselect_b32 s0, s2, s0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 3
-; GFX11-NEXT: s_cselect_b32 s0, s3, s0
-; GFX11-NEXT: s_and_b32 s1, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s1, s1, 3
-; GFX11-NEXT: s_lshr_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s0, s4, 15
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <16 x i8>, ptr addrspace(4) %ptr
%element = extractelement <16 x i8> %vector, i32 %idx
@@ -1385,92 +1767,65 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(ptr addrspace(4) inreg %
define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: s_lshr_b32 s0, s2, 2
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX9-NEXT: s_and_b32 s1, s2, 3
+; GFX9-NEXT: s_and_b32 s0, s2, 15
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT: s_lshl_b32 s0, s1, 3
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: s_lshr_b32 s0, s2, 2
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_and_b32 s1, s2, 3
+; GFX8-NEXT: s_and_b32 s0, s2, 15
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT: s_lshl_b32 s0, s1, 3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_lshr_b32 s0, s2, 2
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_and_b32 s1, s2, 3
+; GFX7-NEXT: s_and_b32 s0, s2, 15
+; GFX7-NEXT: s_ashr_i32 s1, s0, 31
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX7-NEXT: s_lshl_b32 s0, s1, 3
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: s_lshr_b32 s0, s2, 2
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
+; GFX10-NEXT: s_and_b32 s0, s2, 15
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX10-NEXT: s_and_b32 s0, s2, 3
-; GFX10-NEXT: s_lshl_b32 s0, s0, 3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: s_lshr_b32 s0, s2, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
+; GFX11-NEXT: s_and_b32 s0, s2, 15
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX11-NEXT: s_and_b32 s0, s2, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: s_lshl_b32 s0, s0, 3
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <16 x i8>, ptr addrspace(1) %ptr
@@ -1482,91 +1837,60 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX9-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 2, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 2, v2
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 2, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 2, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 2, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 %idx
@@ -1574,64 +1898,69 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
}
define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
-; GCN-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_and_b32_e32 v0, 3, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1
-; GCN-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_and_b32_e32 v2, 15, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_and_b32_e32 v2, 15, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 15, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 15, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <16 x i8>, ptr addrspace(4) %ptr
@@ -1643,14 +1972,14 @@ define i8 @extractelement_vgpr_v16i8_idx0(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1660,21 +1989,21 @@ define i8 @extractelement_vgpr_v16i8_idx0(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
@@ -1686,17 +2015,17 @@ define i8 @extractelement_vgpr_v16i8_idx1(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx1:
@@ -1705,25 +2034,22 @@ define i8 @extractelement_vgpr_v16i8_idx1(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 1
@@ -1734,17 +2060,17 @@ define i8 @extractelement_vgpr_v16i8_idx2(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx2:
@@ -1753,25 +2079,22 @@ define i8 @extractelement_vgpr_v16i8_idx2(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 2
@@ -1782,17 +2105,17 @@ define i8 @extractelement_vgpr_v16i8_idx3(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx3:
@@ -1801,25 +2124,22 @@ define i8 @extractelement_vgpr_v16i8_idx3(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:3
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:3
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 3
@@ -1830,17 +2150,17 @@ define i8 @extractelement_vgpr_v16i8_idx4(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx4:
@@ -1849,25 +2169,22 @@ define i8 @extractelement_vgpr_v16i8_idx4(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 4
@@ -1878,17 +2195,17 @@ define i8 @extractelement_vgpr_v16i8_idx5(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx5:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx5:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 5, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx5:
@@ -1897,25 +2214,22 @@ define i8 @extractelement_vgpr_v16i8_idx5(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx5:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:5
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx5:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:5
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 5
@@ -1926,17 +2240,17 @@ define i8 @extractelement_vgpr_v16i8_idx6(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx6:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:6
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx6:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx6:
@@ -1945,25 +2259,22 @@ define i8 @extractelement_vgpr_v16i8_idx6(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx6:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx6:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:6
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 6
@@ -1974,17 +2285,17 @@ define i8 @extractelement_vgpr_v16i8_idx7(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx7:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx7:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 7, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx7:
@@ -1993,25 +2304,22 @@ define i8 @extractelement_vgpr_v16i8_idx7(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:7
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx7:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx7:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:7
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 7
@@ -2022,17 +2330,17 @@ define i8 @extractelement_vgpr_v16i8_idx8(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx8:
@@ -2041,25 +2349,22 @@ define i8 @extractelement_vgpr_v16i8_idx8(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:8
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:8
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:8
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 8
@@ -2070,17 +2375,17 @@ define i8 @extractelement_vgpr_v16i8_idx9(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx9:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:9
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx9:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 9, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx9:
@@ -2089,25 +2394,22 @@ define i8 @extractelement_vgpr_v16i8_idx9(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:9
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx9:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:9
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx9:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:9
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 9
@@ -2118,17 +2420,17 @@ define i8 @extractelement_vgpr_v16i8_idx10(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx10:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:10
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx10:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx10:
@@ -2137,25 +2439,22 @@ define i8 @extractelement_vgpr_v16i8_idx10(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:10
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx10:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:10
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:10
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 10
@@ -2166,17 +2465,17 @@ define i8 @extractelement_vgpr_v16i8_idx11(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx11:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:11
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx11:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 11, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx11:
@@ -2185,25 +2484,22 @@ define i8 @extractelement_vgpr_v16i8_idx11(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:11
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx11:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:11
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx11:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:11
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 11
@@ -2214,17 +2510,17 @@ define i8 @extractelement_vgpr_v16i8_idx12(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx12:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:12
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx12:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx12:
@@ -2233,25 +2529,22 @@ define i8 @extractelement_vgpr_v16i8_idx12(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:12
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx12:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:12
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx12:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:12
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 12
@@ -2262,17 +2555,17 @@ define i8 @extractelement_vgpr_v16i8_idx13(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx13:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:13
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx13:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 13, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx13:
@@ -2281,25 +2574,22 @@ define i8 @extractelement_vgpr_v16i8_idx13(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:13
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx13:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:13
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx13:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:13
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 13
@@ -2310,17 +2600,17 @@ define i8 @extractelement_vgpr_v16i8_idx14(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx14:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:14
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx14:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx14:
@@ -2329,25 +2619,22 @@ define i8 @extractelement_vgpr_v16i8_idx14(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:14
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx14:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:14
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx14:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:14
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 14
@@ -2358,17 +2645,17 @@ define i8 @extractelement_vgpr_v16i8_idx15(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:15
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx15:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 15, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx15:
@@ -2377,27 +2664,26 @@ define i8 @extractelement_vgpr_v16i8_idx15(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:15
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx15:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:15
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx15:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:15
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 15
ret i8 %element
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 8bf34caea4051..d4c536bdd5ebe 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -4738,35 +4738,31 @@ define i32 @v_extract_v64i32_7(ptr addrspace(1) %ptr) {
; GPRIDX-LABEL: v_extract_v64i32_7:
; GPRIDX: ; %bb.0:
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GPRIDX-NEXT: global_load_dword v0, v[0:1], off offset:28
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v7
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
; MOVREL-LABEL: v_extract_v64i32_7:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 28, v0
; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; MOVREL-NEXT: flat_load_dword v0, v[0:1]
; MOVREL-NEXT: s_waitcnt vmcnt(0)
-; MOVREL-NEXT: v_mov_b32_e32 v0, v7
; MOVREL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_extract_v64i32_7:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:28
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_extract_v64i32_7:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:28
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
%elt = extractelement <64 x i32> %vec, i32 7
@@ -4777,7 +4773,7 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) {
; GPRIDX-LABEL: v_extract_v64i32_32:
; GPRIDX: ; %bb.0:
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128
+; GPRIDX-NEXT: global_load_dword v0, v[0:1], off offset:128
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
@@ -4786,21 +4782,21 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) {
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; MOVREL-NEXT: flat_load_dword v0, v[0:1]
; MOVREL-NEXT: s_waitcnt vmcnt(0)
; MOVREL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_extract_v64i32_32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:128
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_extract_v64i32_32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:128
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
@@ -4812,35 +4808,31 @@ define i32 @v_extract_v64i32_33(ptr addrspace(1) %ptr) {
; GPRIDX-LABEL: v_extract_v64i32_33:
; GPRIDX: ; %bb.0:
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128
+; GPRIDX-NEXT: global_load_dword v0, v[0:1], off offset:132
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v1
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
; MOVREL-LABEL: v_extract_v64i32_33:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
+; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x84, v0
; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; MOVREL-NEXT: flat_load_dword v0, v[0:1]
; MOVREL-NEXT: s_waitcnt vmcnt(0)
-; MOVREL-NEXT: v_mov_b32_e32 v0, v1
; MOVREL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_extract_v64i32_33:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:132
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_extract_v64i32_33:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:128
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:132
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
%elt = extractelement <64 x i32> %vec, i32 33
@@ -4851,35 +4843,31 @@ define i32 @v_extract_v64i32_37(ptr addrspace(1) %ptr) {
; GPRIDX-LABEL: v_extract_v64i32_37:
; GPRIDX: ; %bb.0:
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:144
+; GPRIDX-NEXT: global_load_dword v0, v[0:1], off offset:148
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v5
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
; MOVREL-LABEL: v_extract_v64i32_37:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x90, v0
+; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x94, v0
; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; MOVREL-NEXT: flat_load_dword v0, v[0:1]
; MOVREL-NEXT: s_waitcnt vmcnt(0)
-; MOVREL-NEXT: v_mov_b32_e32 v0, v5
; MOVREL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_extract_v64i32_37:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:144
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:148
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_extract_v64i32_37:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:144
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:148
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
%elt = extractelement <64 x i32> %vec, i32 37
More information about the llvm-commits mailing list