[llvm] [GISel] Combine vector load followed by an extractelement (PR #72670)
Pranav Taneja via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 17 08:04:38 PST 2023
https://github.com/prtaneja created https://github.com/llvm/llvm-project/pull/72670
Narrow a vector load followed by an extractelement instruction into an element-level (scalar) load.
This is similar to the scalarizeExtractedVectorLoad function in the DAGCombiner.
>From 07e46dd427ee4cd1b5b232767f33f38dffb6c613 Mon Sep 17 00:00:00 2001
From: Pranav <Pranav.Taneja at amd.com>
Date: Mon, 6 Nov 2023 06:03:00 +0000
Subject: [PATCH] [GISel] Combine vector load followed by extract element into
a narrowed load.
---
.../llvm/CodeGen/GlobalISel/CombinerHelper.h | 5 +
.../include/llvm/Target/GlobalISel/Combine.td | 10 +-
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 134 ++
.../AArch64/arm64-indexed-vector-ldst.ll | 58 +-
.../AMDGPU/GlobalISel/combine-fma-add-mul.ll | 80 +-
.../GlobalISel/extractelement-stack-lower.ll | 515 +----
.../AMDGPU/GlobalISel/extractelement.i128.ll | 899 ++------
.../AMDGPU/GlobalISel/extractelement.i16.ll | 1340 +++++++-----
.../AMDGPU/GlobalISel/extractelement.i8.ll | 1852 +++++++++--------
.../AMDGPU/GlobalISel/extractelement.ll | 50 +-
10 files changed, 2240 insertions(+), 2703 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index ba72a3b71ffd70b..f51401754c3f3fa 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -196,6 +196,11 @@ class CombinerHelper {
/// Match (and (load x), mask) -> zextload x
bool matchCombineLoadWithAndMask(MachineInstr &MI, BuildFnTy &MatchInfo);
+ /// Match/apply pair: fold a G_EXTRACT_VECTOR_ELT whose vector operand is
+ /// defined by a G_LOAD into a single narrowed, element-sized load.
+ bool matchCombineExtractedVectorLoad(MachineInstr &MI);
+ void applyCombineExtractedVectorLoad(MachineInstr &MI);
+
bool matchCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo);
void applyCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 76b83cc5df073ae..ee8dac06b6a4328 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -259,6 +259,12 @@ def sext_inreg_to_zext_inreg : GICombineRule<
}])
>;
+// Rooted at G_EXTRACT_VECTOR_ELT; matching and rewriting are delegated to
+// CombinerHelper::{match,apply}CombineExtractedVectorLoad.
+def combine_extracted_vector_load : GICombineRule<
+ (defs root:$root),
+ (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root,
+ [{ return Helper.matchCombineExtractedVectorLoad(*${root}); }]),
+ (apply [{ Helper.applyCombineExtractedVectorLoad(*${root}); }])>;
+
def combine_indexed_load_store : GICombineRule<
(defs root:$root, indexed_load_store_matchdata:$matchinfo),
(match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD, G_STORE):$root,
@@ -1283,8 +1289,8 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop,
constant_fold_fp_binop]>;
def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
- extract_vec_elt_combines, combines_for_extload,
- undef_combines, identity_combines, phi_combines,
+ extract_vec_elt_combines, combines_for_extload, combine_extracted_vector_load,
+ undef_combines, identity_combines, phi_combines,
simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, shifts_too_big,
reassocs, ptr_add_immed_chain,
shl_ashr_to_sext_inreg, sext_inreg_of_load,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 51c268ab77c2220..b0018b990067afb 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1165,6 +1165,140 @@ bool CombinerHelper::findPreIndexCandidate(GLoadStore &LdSt, Register &Addr,
return RealUse;
}
+/// Decide whether a G_EXTRACT_VECTOR_ELT of a loaded vector can be replaced
+/// by a single load of just the extracted element.
+///
+/// \param MI the G_EXTRACT_VECTOR_ELT being examined.
+/// \returns true only if the vector is directly defined by a simple G_LOAD
+/// in the same block with no load-fold barrier in between, the extract is the
+/// vector's only user, the element type is byte sized, a constant index is in
+/// range, and the narrowed load is legal (or we are before the legalizer),
+/// allowed and fast on the target.
+bool CombinerHelper::matchCombineExtractedVectorLoad(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
+
+  // Check if there is a load that defines the vector being extracted from.
+  Register Vector = MI.getOperand(1).getReg();
+  MachineInstr *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, Vector, MRI);
+  if (!LoadMI)
+    return false;
+
+  // getOpcodeDef may look through copies. The apply step erases the load, so
+  // only accept a load that defines the extracted vector register itself;
+  // otherwise the one-use check below would be looking at the wrong register.
+  if (LoadMI->getOperand(0).getReg() != Vector)
+    return false;
+
+  LLT VecTy = MRI.getType(Vector);
+  LLT VecEltVT = VecTy.getElementType();
+  LLT ResultVT = MRI.getType(MI.getOperand(0).getReg());
+
+  // Do not combine when result type and vector element type are not the same.
+  if (ResultVT != VecEltVT)
+    return false;
+
+  // Checking whether we should reduce the load width.
+  if (VecEltVT.isVector() || !MRI.hasOneUse(Vector))
+    return false;
+
+  GLoadStore *GLoadMI = cast<GLoadStore>(LoadMI);
+
+  // Check if the defining load is simple.
+  if (!GLoadMI->isSimple())
+    return false;
+
+  // If the vector element type is not a multiple of a byte then we are unable
+  // to correctly compute an address to load only the extracted element as a
+  // scalar.
+  if (!VecEltVT.isByteSized())
+    return false;
+
+  // The narrowed load is emitted at the extract, i.e. it is effectively sunk
+  // from the position of the original load. That is only sound if nothing in
+  // between may write the loaded memory. Keep the scan local and bounded.
+  if (MI.getParent() != LoadMI->getParent())
+    return false;
+  const unsigned MaxScan = 20;
+  unsigned Scanned = 0;
+  for (auto It = LoadMI->getIterator(), End = MI.getIterator(); It != End;
+       ++It) {
+    if (It->isLoadFoldBarrier())
+      return false;
+    if (Scanned++ == MaxScan)
+      return false;
+  }
+
+  // Check if the new load that we are going to create is legal
+  // if we are in the post-legalization phase.
+  const MachineMemOperand &MMO = GLoadMI->getMMO();
+  MachinePointerInfo PtrInfo;
+
+  Register Index = MI.getOperand(2).getReg();
+
+  // Byte size of one element. FIXME: should be the ABI size.
+  const uint64_t EltBytes = VecEltVT.getSizeInBits() / 8;
+  uint64_t Offset;
+
+  // Finding the appropriate PtrInfo if offset is a known constant.
+  // This is required to create the memory operand for the narrowed load.
+  // This machine memory operand object helps us infer about legality
+  // before we proceed to combine the instruction.
+  MachineInstr *IndexDef = MRI.getVRegDef(Index);
+  if (IndexDef->getOpcode() == TargetOpcode::G_CONSTANT) {
+    const APInt &Elt = IndexDef->getOperand(1).getCImm()->getValue();
+    // An out-of-range constant index would turn into a memory access outside
+    // of the original load; leave such extracts alone.
+    if (Elt.uge(VecTy.getNumElements()))
+      return false;
+    Offset = EltBytes * Elt.getZExtValue();
+    PtrInfo = MMO.getPointerInfo().getWithOffset(Offset);
+  } else {
+    // Discard the pointer info except the address space because the memory
+    // operand can't represent this new access since the offset is variable.
+    // Any element may be accessed, so only element-stride alignment is known.
+    Offset = EltBytes;
+    PtrInfo = MachinePointerInfo(MMO.getPointerInfo().getAddrSpace());
+  }
+
+  // Query the target with the alignment the narrowed load will really have,
+  // not with the (possibly larger) alignment of the whole vector.
+  const Align Alignment = commonAlignment(MMO.getAlign(), Offset);
+
+  Register VecPtr = GLoadMI->getOperand(1).getReg();
+  LLT PtrTy = MRI.getType(VecPtr);
+
+  MachineFunction &MF = *MI.getMF();
+  auto *NewMMO =
+      MF.getMachineMemOperand(PtrInfo, MMO.getFlags(), VecEltVT, Alignment);
+
+  LegalityQuery::MemDesc MMDesc(*NewMMO);
+
+  LegalityQuery Q = {TargetOpcode::G_LOAD, {VecEltVT, PtrTy}, {MMDesc}};
+
+  if (!isLegalOrBeforeLegalizer(Q))
+    return false;
+
+  // Load must be allowed and fast on the target.
+  LLVMContext &C = MF.getFunction().getContext();
+  auto &DL = MF.getDataLayout();
+  unsigned Fast = 0;
+  if (!getTargetLowering().allowsMemoryAccess(C, DL, VecEltVT, *NewMMO,
+                                              &Fast) ||
+      !Fast)
+    return false;
+
+  return true;
+}
+
+/// Rewrite a G_EXTRACT_VECTOR_ELT of a loaded vector into one load of the
+/// selected element, then erase both the extract and the original vector
+/// load.
+///
+/// \param MI the G_EXTRACT_VECTOR_ELT; its vector operand must be defined by
+/// a G_LOAD (as established by matchCombineExtractedVectorLoad).
+void CombinerHelper::applyCombineExtractedVectorLoad(MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
+
+  // Get the original load instruction.
+  MachineInstr *LoadMI =
+      getOpcodeDef(TargetOpcode::G_LOAD, MI.getOperand(1).getReg(), MRI);
+  assert(LoadMI && "extracted vector is not defined by a G_LOAD");
+
+  GLoadStore *GLoadMI = cast<GLoadStore>(LoadMI);
+
+  Register Index = MI.getOperand(2).getReg();
+  LLT VecTy = MRI.getType(GLoadMI->getOperand(0).getReg());
+  LLT VecEltVT = VecTy.getElementType();
+  Register Result = MI.getOperand(0).getReg();
+
+  MachineIRBuilder MIRBuilder(MI);
+
+  // Byte size of one element. FIXME: should be the ABI size.
+  const uint64_t EltBytes = VecEltVT.getSizeInBits() / 8;
+  uint64_t Offset;
+  MachinePointerInfo PtrInfo;
+
+  // Check if Index to extract element from is constant.
+  MachineInstr *IndexDef = MRI.getVRegDef(Index);
+  if (IndexDef->getOpcode() == TargetOpcode::G_CONSTANT) {
+    // Work on the full-width value: narrowing it through 'int' would corrupt
+    // large indices before the multiplication.
+    const APInt &Elt = IndexDef->getOperand(1).getCImm()->getValue();
+    if (Elt.uge(VecTy.getNumElements())) {
+      // Extracting past the end of the vector has no defined result; do not
+      // turn it into a memory access outside of the original load.
+      MIRBuilder.buildUndef(Result);
+      MI.eraseFromParent();
+      GLoadMI->eraseFromParent();
+      return;
+    }
+    Offset = EltBytes * Elt.getZExtValue();
+    PtrInfo = GLoadMI->getMMO().getPointerInfo().getWithOffset(Offset);
+  } else {
+    // Discard the pointer info except the address space because the memory
+    // operand can't represent this new access since the offset is variable.
+    // Any element may be accessed, so only element-stride alignment is known.
+    Offset = EltBytes;
+    PtrInfo =
+        MachinePointerInfo(GLoadMI->getMMO().getPointerInfo().getAddrSpace());
+  }
+
+  const Align Alignment = commonAlignment(GLoadMI->getMMO().getAlign(), Offset);
+
+  MachineOperand &BasePtr = GLoadMI->getOperand(1);
+
+  // Get pointer to the vector element.
+  GISelObserverWrapper DummyObserver;
+  LegalizerHelper Helper(MIRBuilder.getMF(), DummyObserver, MIRBuilder);
+  Register ElementPtr =
+      Helper.getVectorElementPointer(BasePtr.getReg(), VecTy, Index);
+
+  // New G_LOAD instruction.
+  MIRBuilder.buildLoad(Result, ElementPtr, PtrInfo, Alignment);
+
+  // Remove original Extract Element MI and GLOAD instructions.
+  MI.eraseFromParent();
+  GLoadMI->eraseFromParent();
+}
+
bool CombinerHelper::matchCombineIndexedLoadStore(
MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
auto &LdSt = cast<GLoadStore>(MI);
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 0d7620d1c883d68..7493afd672d4378 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -14659,17 +14659,9 @@ define i8 @load_single_extract_variable_index_i8(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_i8:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: mov w9, w1
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and x9, x9, #0xf
-; CHECK-GISEL-NEXT: lsl x10, x9, #1
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: sub x9, x10, x9
-; CHECK-GISEL-NEXT: ldrb w0, [x8, x9]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: mov w8, w1
+; CHECK-GISEL-NEXT: and x8, x8, #0xf
+; CHECK-GISEL-NEXT: ldrb w0, [x0, x8]
; CHECK-GISEL-NEXT: ret
%lv = load <16 x i8>, ptr %A
%e = extractelement <16 x i8> %lv, i32 %idx
@@ -14692,15 +14684,9 @@ define i16 @load_single_extract_variable_index_i16(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_i16:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov w9, w1
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and x9, x9, #0x7
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: ldrh w0, [x8, x9, lsl #1]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: mov w8, w1
+; CHECK-GISEL-NEXT: and x8, x8, #0x7
+; CHECK-GISEL-NEXT: ldrh w0, [x0, x8, lsl #1]
; CHECK-GISEL-NEXT: ret
%lv = load <8 x i16>, ptr %A
%e = extractelement <8 x i16> %lv, i32 %idx
@@ -14717,15 +14703,9 @@ define i32 @load_single_extract_variable_index_i32(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_i32:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov w9, w1
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and x9, x9, #0x3
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: ldr w0, [x8, x9, lsl #2]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: mov w8, w1
+; CHECK-GISEL-NEXT: and x8, x8, #0x3
+; CHECK-GISEL-NEXT: ldr w0, [x0, x8, lsl #2]
; CHECK-GISEL-NEXT: ret
%lv = load <4 x i32>, ptr %A
%e = extractelement <4 x i32> %lv, i32 %idx
@@ -14779,14 +14759,8 @@ define i32 @load_single_extract_variable_index_masked_i32(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_masked_i32:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and w9, w1, #0x3
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: ldr w0, [x8, w9, uxtw #2]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: and w8, w1, #0x3
+; CHECK-GISEL-NEXT: ldr w0, [x0, w8, uxtw #2]
; CHECK-GISEL-NEXT: ret
%idx.x = and i32 %idx, 3
%lv = load <4 x i32>, ptr %A
@@ -14803,14 +14777,8 @@ define i32 @load_single_extract_variable_index_masked2_i32(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_masked2_i32:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and w9, w1, #0x1
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: ldr w0, [x8, w9, uxtw #2]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: and w8, w1, #0x1
+; CHECK-GISEL-NEXT: ldr w0, [x0, w8, uxtw #2]
; CHECK-GISEL-NEXT: ret
%idx.x = and i32 %idx, 1
%lv = load <4 x i32>, ptr %A
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
index 69346de9bb79805..80dc3dead35ab23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -124,71 +124,71 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
; GFX9-LABEL: test_add_mul_multiple_defs_z:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_add_mul_multiple_defs_z:
; GFX9-CONTRACT: ; %bb.0: ; %.entry
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v3
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-DENORM-LABEL: test_add_mul_multiple_defs_z:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
-; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
; GFX9-UNSAFE: ; %bb.0: ; %.entry
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v3
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_add_mul_multiple_defs_z:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_add_mul_multiple_defs_z:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v3, v0, v1
-; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2
; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DENORM-LABEL: test_add_mul_multiple_defs_z:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
-; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v3, v0, v1
-; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v2
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
%a = fmul float %x, %y
@@ -202,71 +202,71 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX9-CONTRACT: ; %bb.0: ; %.entry
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v3
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
-; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX9-UNSAFE: ; %bb.0: ; %.entry
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v3
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v2, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v3, v0, v1
-; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2
; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
-; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v3, v0, v1
-; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v2
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
%a = fmul float %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
index 33a4d3c5494f7c9..a13c60b4e84143f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll
@@ -8,170 +8,12 @@ define i32 @v_extract_v64i32_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GCN-LABEL: v_extract_v64i32_varidx:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, s33
-; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0
-; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v6, v2
-; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16
-; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32
-; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48
-; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64
-; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80
-; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96
-; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112
-; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128
-; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144
-; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160
-; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176
-; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192
-; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208
-; GCN-NEXT: s_add_i32 s32, s32, 0x10000
-; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224
-; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240
-; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33
-; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:268
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:272
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:288
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300
-; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308
-; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312
-; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316
-; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320
-; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328
-; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:348
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:352
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380
-; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384
-; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388
-; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392
-; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404
-; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408
-; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412
-; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416
-; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420
-; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424
-; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428
-; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432
-; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436
-; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440
-; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444
-; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448
-; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452
-; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456
-; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT: v_and_b32_e32 v0, 63, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: v_add_u32_e32 v0, v1, v0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v16, v20
-; GCN-NEXT: v_mov_b32_e32 v17, v21
-; GCN-NEXT: v_mov_b32_e32 v18, v22
-; GCN-NEXT: v_mov_b32_e32 v19, v23
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:464
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:468
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:472
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:476
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508
-; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: v_and_b32_e32 v2, 63, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v2
+; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT: global_load_dword v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
@@ -183,174 +25,12 @@ define i16 @v_extract_v128i16_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GCN-LABEL: v_extract_v128i16_varidx:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, s33
-; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0
-; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v6, v2
-; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16
-; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32
-; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48
-; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64
-; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80
-; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96
-; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112
-; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128
-; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144
-; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160
-; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176
-; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192
-; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208
-; GCN-NEXT: s_add_i32 s32, s32, 0x10000
-; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224
-; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240
-; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33
-; GCN-NEXT: v_add_u32_e32 v1, 0x100, v1
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:268
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:272
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:288
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300
-; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308
-; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312
-; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316
-; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320
-; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328
-; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:348
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:352
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380
-; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384
-; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388
-; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392
-; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404
-; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408
-; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412
-; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416
-; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420
-; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424
-; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428
-; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432
-; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436
-; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440
-; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444
-; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448
-; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452
-; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456
-; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT: v_bfe_u32 v0, v6, 1, 6
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: v_add_u32_e32 v0, v1, v0
-; GCN-NEXT: v_and_b32_e32 v1, 1, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v1
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v16, v20
-; GCN-NEXT: v_mov_b32_e32 v17, v21
-; GCN-NEXT: v_mov_b32_e32 v18, v22
-; GCN-NEXT: v_mov_b32_e32 v19, v23
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:464
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:468
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:472
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:476
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508
-; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b32 s33, s4
-; GCN-NEXT: s_waitcnt vmcnt(16)
-; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0
+; GCN-NEXT: v_and_b32_e32 v2, 0x7f, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT: global_load_ushort v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%vec = load <128 x i16>, ptr addrspace(1) %ptr
@@ -362,171 +42,12 @@ define i64 @v_extract_v32i64_varidx(ptr addrspace(1) %ptr, i32 %idx) {
; GCN-LABEL: v_extract_v32i64_varidx:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, s33
-; GCN-NEXT: s_add_i32 s33, s32, 0x3fc0
-; GCN-NEXT: s_and_b32 s33, s33, 0xffffc000
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v6, v2
-; GCN-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GCN-NEXT: global_load_dwordx4 v[16:19], v[0:1], off offset:16
-; GCN-NEXT: global_load_dwordx4 v[56:59], v[0:1], off offset:32
-; GCN-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:48
-; GCN-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:64
-; GCN-NEXT: global_load_dwordx4 v[44:47], v[0:1], off offset:80
-; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:96
-; GCN-NEXT: global_load_dwordx4 v[60:63], v[0:1], off offset:112
-; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:128
-; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:144
-; GCN-NEXT: global_load_dwordx4 v[28:31], v[0:1], off offset:160
-; GCN-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:176
-; GCN-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:192
-; GCN-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:208
-; GCN-NEXT: s_add_i32 s32, s32, 0x10000
-; GCN-NEXT: s_add_i32 s32, s32, 0xffff0000
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:224
-; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:240
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:256
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:260
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:264
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:268
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:272
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:276
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:280
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:284
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:288
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:292
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:296
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:300
-; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s33 offset:304
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s33 offset:308
-; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s33 offset:312
-; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s33 offset:316
-; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:320
-; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:324
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:328
-; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:332
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:336
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:340
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:344
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:348
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:352
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:356
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:360
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:364
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:368
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:372
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:376
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:380
-; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s33 offset:384
-; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s33 offset:388
-; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s33 offset:392
-; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:396
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:400
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:404
-; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s33 offset:408
-; GCN-NEXT: buffer_store_dword v35, off, s[0:3], s33 offset:412
-; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:416
-; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:420
-; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:424
-; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:428
-; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s33 offset:432
-; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s33 offset:436
-; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s33 offset:440
-; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s33 offset:444
-; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:448
-; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:452
-; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:456
-; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:460
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:512 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:516 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT: v_and_b32_e32 v0, 31, v6
-; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT: v_add_u32_e32 v2, 0x100, v2
-; GCN-NEXT: v_add_u32_e32 v1, v2, v0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v16, v20
-; GCN-NEXT: v_mov_b32_e32 v17, v21
-; GCN-NEXT: v_mov_b32_e32 v18, v22
-; GCN-NEXT: v_mov_b32_e32 v19, v23
-; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:464
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:468
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:472
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:476
-; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:480
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:484
-; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:488
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:492
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:496
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:500
-; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:504
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:508
-; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
-; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: v_and_b32_e32 v2, 31, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%vec = load <32 x i64>, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
index 786d65f7dcc40db..057790617204cc5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
@@ -8,29 +8,36 @@
define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
; GCN-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0
-; GCN-NEXT: s_lshl_b32 m0, s4, 1
+; GCN-NEXT: s_and_b32 s0, s4, 3
+; GCN-NEXT: s_lshl_b32 s0, s0, 4
+; GCN-NEXT: s_ashr_i32 s1, s0, 31
+; GCN-NEXT: s_add_u32 s0, s2, s0
+; GCN-NEXT: s_addc_u32 s1, s3, s1
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_movrels_b64 s[0:1], s[8:9]
-; GCN-NEXT: s_movrels_b64 s[2:3], s[10:11]
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0
-; GFX10-NEXT: s_lshl_b32 m0, s4, 1
+; GFX10-NEXT: s_and_b32 s0, s4, 3
+; GFX10-NEXT: s_lshl_b32 s0, s0, 4
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_add_u32 s0, s2, s0
+; GFX10-NEXT: s_addc_u32 s1, s3, s1
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_movrels_b64 s[0:1], s[8:9]
-; GFX10-NEXT: s_movrels_b64 s[2:3], s[10:11]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b512 s[8:23], s[2:3], 0x0
-; GFX11-NEXT: s_lshl_b32 m0, s4, 1
+; GFX11-NEXT: s_and_b32 s0, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s0, s0, 4
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_movrels_b64 s[0:1], s[8:9]
-; GFX11-NEXT: s_movrels_b64 s[2:3], s[10:11]
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 %idx
@@ -40,46 +47,32 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inre
define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
-; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
-; GFX9-NEXT: s_lshl_b32 s0, s2, 1
-; GFX9-NEXT: s_lshl_b32 s2, s0, 1
+; GFX9-NEXT: s_and_b32 s0, s2, 3
+; GFX9-NEXT: s_lshl_b32 s0, s0, 4
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-NEXT: v_mov_b32_e32 v18, v2
-; GFX9-NEXT: s_set_gpr_idx_off
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
-; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
-; GFX9-NEXT: v_mov_b32_e32 v3, v3
-; GFX9-NEXT: s_set_gpr_idx_off
-; GFX9-NEXT: v_readfirstlane_b32 s2, v18
+; GFX9-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NEXT: v_readfirstlane_b32 s3, v3
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v0
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
-; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[6:7]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[10:11]
-; GFX8-NEXT: flat_load_dwordx4 v[14:17], v[0:1]
-; GFX8-NEXT: s_lshl_b32 s0, s2, 1
-; GFX8-NEXT: s_lshl_b32 m0, s0, 1
+; GFX8-NEXT: s_and_b32 s0, s2, 3
+; GFX8-NEXT: s_lshl_b32 s0, s0, 4
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_movrels_b32_e32 v1, v3
-; GFX8-NEXT: v_movrels_b32_e32 v0, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_readfirstlane_b32 s2, v2
@@ -88,20 +81,13 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr
;
; GFX7-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
-; GFX7-NEXT: buffer_load_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:32
-; GFX7-NEXT: buffer_load_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
-; GFX7-NEXT: s_lshl_b32 s0, s2, 1
-; GFX7-NEXT: s_lshl_b32 m0, s0, 1
+; GFX7-NEXT: s_and_b32 s0, s2, 3
+; GFX7-NEXT: s_lshl_b32 s0, s0, 4
+; GFX7-NEXT: s_ashr_i32 s1, s0, 31
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_movrels_b32_e32 v1, v3
-; GFX7-NEXT: v_movrels_b32_e32 v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s1, v1
; GFX7-NEXT: v_readfirstlane_b32 s2, v2
@@ -110,44 +96,38 @@ define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr
;
; GFX10-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x3
-; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
-; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
-; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
-; GFX10-NEXT: s_lshl_b32 s0, s2, 1
-; GFX10-NEXT: s_lshl_b32 m0, s0, 1
+; GFX10-NEXT: s_and_b32 s0, s2, 3
+; GFX10-NEXT: s_lshl_b32 s0, s0, 4
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_movrels_b32_e32 v1, v3
-; GFX10-NEXT: v_movrels_b32_e32 v0, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off
-; GFX11-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16
-; GFX11-NEXT: global_load_b128 v[10:13], v[0:1], off offset:32
-; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off offset:48
-; GFX11-NEXT: s_lshl_b32 s0, s2, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: s_lshl_b32 m0, s0, 1
+; GFX11-NEXT: s_and_b32 s0, s2, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s0, s0, 4
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_movrels_b32_e32 v0, v2
-; GFX11-NEXT: v_movrels_b32_e32 v1, v3
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: v_readfirstlane_b32 s3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
+; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(1) %ptr
%element = extractelement <4 x i128> %vector, i32 %idx
@@ -158,298 +138,66 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2
-; GFX9-NEXT: v_add_u32_e32 v16, 1, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5]
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
-; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v14, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v15, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0
-; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8]
-; GFX8-NEXT: v_lshlrev_b32_e32 v16, 1, v2
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v16
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v11, v12, v8, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16
+; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v17
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v15, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s10, 0
-; GFX7-NEXT: s_mov_b32 s11, 0xf000
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
-; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:16
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2
-; GFX7-NEXT: v_add_i32_e32 v16, vcc, 1, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5]
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
-; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32
-; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:48
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v14, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v15, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2
-; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
-; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v2
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v3
-; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_cndmask_b32_e32 v16, v12, v14, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v12, v14, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v13, v15, s4
-; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v3
-; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v4, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v5, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v19, v5, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v6, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v7, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v3
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s4
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v14, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v15, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_load_b128 v[16:19], v[0:1], off
-; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
-; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:32
-; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:48
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: s_waitcnt vmcnt(3)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v19, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v18 :: v_dual_add_nc_u32 v1, 1, v0
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
-; GFX11-NEXT: s_waitcnt vmcnt(2)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v18, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v19, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_cndmask_b32 v3, v3, v7
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v1
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v8 :: v_dual_cndmask_b32 v3, v3, v9
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v6, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v7, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v1
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v12 :: v_dual_cndmask_b32 v3, v3, v13
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v14, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v15, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v14, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v15, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i128>, ptr addrspace(1) %ptr
%element = extractelement <4 x i128> %vector, i32 %idx
@@ -459,68 +207,15 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx
define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
; GFX9-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_add_u32_e32 v19, 1, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v1, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v7, s6
-; GFX9-NEXT: v_mov_b32_e32 v8, s7
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v6, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX9-NEXT: v_mov_b32_e32 v9, s8
-; GFX9-NEXT: v_mov_b32_e32 v10, s9
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v8, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
-; GFX9-NEXT: v_mov_b32_e32 v11, s10
-; GFX9-NEXT: v_mov_b32_e32 v12, s11
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v10, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
-; GFX9-NEXT: v_mov_b32_e32 v13, s12
-; GFX9-NEXT: v_mov_b32_e32 v14, s13
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v12, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v14, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
-; GFX9-NEXT: v_mov_b32_e32 v15, s14
-; GFX9-NEXT: v_mov_b32_e32 v16, s15
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v14, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v15, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_readfirstlane_b32 s2, v2
@@ -529,68 +224,15 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
;
; GFX8-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_mov_b32_e32 v5, s4
-; GFX8-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v1, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v7, s6
-; GFX8-NEXT: v_mov_b32_e32 v8, s7
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v6, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX8-NEXT: v_mov_b32_e32 v9, s8
-; GFX8-NEXT: v_mov_b32_e32 v10, s9
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v7, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v8, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
-; GFX8-NEXT: v_mov_b32_e32 v11, s10
-; GFX8-NEXT: v_mov_b32_e32 v12, s11
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v10, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
-; GFX8-NEXT: v_mov_b32_e32 v13, s12
-; GFX8-NEXT: v_mov_b32_e32 v14, s13
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v12, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v13, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v14, vcc
-; GFX8-NEXT: v_add_u32_e32 v19, vcc, 1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
-; GFX8-NEXT: v_mov_b32_e32 v15, s14
-; GFX8-NEXT: v_mov_b32_e32 v16, s15
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v14, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v19
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v15, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_readfirstlane_b32 s2, v2
@@ -599,68 +241,15 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
;
; GFX7-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, s0
-; GFX7-NEXT: v_mov_b32_e32 v2, s1
-; GFX7-NEXT: v_mov_b32_e32 v3, s2
-; GFX7-NEXT: v_mov_b32_e32 v4, s3
-; GFX7-NEXT: v_mov_b32_e32 v5, s4
-; GFX7-NEXT: v_mov_b32_e32 v6, s5
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v1, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX7-NEXT: v_mov_b32_e32 v7, s6
-; GFX7-NEXT: v_mov_b32_e32 v8, s7
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v5, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v6, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX7-NEXT: v_mov_b32_e32 v9, s8
-; GFX7-NEXT: v_mov_b32_e32 v10, s9
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v7, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v8, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
-; GFX7-NEXT: v_mov_b32_e32 v11, s10
-; GFX7-NEXT: v_mov_b32_e32 v12, s11
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v9, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v10, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
-; GFX7-NEXT: v_mov_b32_e32 v13, s12
-; GFX7-NEXT: v_mov_b32_e32 v14, s13
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v12, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v13, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v14, vcc
-; GFX7-NEXT: v_add_i32_e32 v19, vcc, 1, v0
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
-; GFX7-NEXT: v_mov_b32_e32 v15, s14
-; GFX7-NEXT: v_mov_b32_e32 v16, s15
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
-; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1]
-; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v14, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v19
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v15, vcc
-; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s1, v1
; GFX7-NEXT: v_readfirstlane_b32 s2, v2
@@ -669,54 +258,15 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
;
; GFX10-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s6
-; GFX10-NEXT: v_mov_b32_e32 v3, s7
-; GFX10-NEXT: v_cndmask_b32_e32 v4, s4, v2, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v5, s5, v3, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, s4, v2, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, s5, v3, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s9, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s9, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s11, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s11, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 4, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 6, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 7, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0
+; GFX10-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
@@ -725,63 +275,18 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
;
; GFX11-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_lshlrev_b32 v0, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_add_nc_u32 v1, 1, v0
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, s4, v2, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v5, s5, v3, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, s4, v2, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, s5, v3, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s9, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s8, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s9, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s11, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s11, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
@@ -793,19 +298,19 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inre
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %ptr) {
; GCN-LABEL: extractelement_sgpr_v4i128_idx0:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0
+; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
@@ -814,34 +319,34 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %p
}
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i128_idx1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mov_b32 s2, s6
-; GCN-NEXT: s_mov_b32 s3, s7
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i128_idx1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_idx1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_idx1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x4
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x10
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s4
-; GFX10-NEXT: s_mov_b32 s1, s5
-; GFX10-NEXT: s_mov_b32 s2, s6
-; GFX10-NEXT: s_mov_b32 s3, s7
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0
+; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x10
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s4
-; GFX11-NEXT: s_mov_b32 s1, s5
-; GFX11-NEXT: s_mov_b32 s2, s6
-; GFX11-NEXT: s_mov_b32 s3, s7
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 1
@@ -849,34 +354,34 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %p
}
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i128_idx2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s8
-; GCN-NEXT: s_mov_b32 s1, s9
-; GCN-NEXT: s_mov_b32 s2, s10
-; GCN-NEXT: s_mov_b32 s3, s11
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i128_idx2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_idx2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_idx2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x8
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x20
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s8
-; GFX10-NEXT: s_mov_b32 s1, s9
-; GFX10-NEXT: s_mov_b32 s2, s10
-; GFX10-NEXT: s_mov_b32 s3, s11
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0
+; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x20
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s8
-; GFX11-NEXT: s_mov_b32 s1, s9
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 2
@@ -884,34 +389,34 @@ define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %p
}
define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i128_idx3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s12
-; GCN-NEXT: s_mov_b32 s1, s13
-; GCN-NEXT: s_mov_b32 s2, s14
-; GCN-NEXT: s_mov_b32 s3, s15
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i128_idx3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i128_idx3:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i128_idx3:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0xc
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x30
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s12
-; GFX10-NEXT: s_mov_b32 s1, s13
-; GFX10-NEXT: s_mov_b32 s2, s14
-; GFX10-NEXT: s_mov_b32 s3, s15
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i128_idx3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0
+; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x30
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s12
-; GFX11-NEXT: s_mov_b32 s1, s13
-; GFX11-NEXT: s_mov_b32 s2, s14
-; GFX11-NEXT: s_mov_b32 s3, s15
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i128>, ptr addrspace(4) %ptr
%element = extractelement <4 x i128> %vector, i32 3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
index 7028d1157787fe9..d21b8a383999cae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll
@@ -6,42 +6,74 @@
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_lshr_b32 s2, s4, 1
-; GCN-NEXT: s_cmp_eq_u32 s2, 1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cselect_b32 s0, s1, s0
-; GCN-NEXT: s_and_b32 s1, s4, 1
-; GCN-NEXT: s_lshl_b32 s1, s1, 4
-; GCN-NEXT: s_lshr_b32 s0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s4, 3
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: s_add_u32 s0, s2, s0
+; GFX9-NEXT: s_addc_u32 s1, s3, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s4, 3
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: s_add_u32 s0, s2, s0
+; GFX8-NEXT: s_addc_u32 s1, s3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_and_b32 s2, s4, 3
+; GFX7-NEXT: s_lshl_b32 s4, s2, 1
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_lshr_b32 s2, s4, 1
-; GFX10-NEXT: s_cmp_eq_u32 s2, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cselect_b32 s0, s1, s0
-; GFX10-NEXT: s_and_b32 s1, s4, 1
-; GFX10-NEXT: s_lshl_b32 s1, s1, 4
-; GFX10-NEXT: s_lshr_b32 s0, s0, s1
+; GFX10-NEXT: s_and_b32 s0, s4, 3
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_add_u32 s0, s2, s0
+; GFX10-NEXT: s_addc_u32 s1, s3, s1
+; GFX10-NEXT: global_load_ushort v0, v0, s[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i16_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_lshr_b32 s2, s4, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_eq_u32 s2, 1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cselect_b32 s0, s1, s0
-; GFX11-NEXT: s_and_b32 s1, s4, 1
-; GFX11-NEXT: s_lshl_b32 s1, s1, 4
+; GFX11-NEXT: s_and_b32 s0, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshr_b32 s0, s0, s1
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(4) %ptr
%element = extractelement <4 x i16> %vector, i32 %idx
@@ -51,71 +83,71 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(ptr addrspace(4) inreg
define amdgpu_ps i16 @extractelement_vgpr_v4i16_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_lshr_b32 s0, s2, 1
-; GFX9-NEXT: s_and_b32 s1, s2, 1
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX9-NEXT: s_lshl_b32 s0, s1, 4
+; GFX9-NEXT: s_and_b32 s0, s2, 3
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_lshr_b32 s0, s2, 1
-; GFX8-NEXT: s_and_b32 s1, s2, 1
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_lshl_b32 s0, s1, 4
+; GFX8-NEXT: s_and_b32 s0, s2, 3
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_lshr_b32 s0, s2, 1
-; GFX7-NEXT: s_and_b32 s1, s2, 1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_lshl_b32 s0, s1, 4
+; GFX7-NEXT: s_and_b32 s0, s2, 3
+; GFX7-NEXT: s_lshl_b32 s0, s0, 1
+; GFX7-NEXT: s_ashr_i32 s1, s0, 31
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: s_lshr_b32 s0, s2, 1
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX10-NEXT: s_and_b32 s0, s2, 1
-; GFX10-NEXT: s_lshl_b32 s0, s0, 4
+; GFX10-NEXT: s_and_b32 s0, s2, 3
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_vgpr_v4i16_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: s_lshr_b32 s0, s2, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX11-NEXT: s_and_b32 s0, s2, 1
-; GFX11-NEXT: s_lshl_b32 s0, s0, 4
+; GFX11-NEXT: s_and_b32 s0, s2, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(1) %ptr
@@ -127,70 +159,66 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
; GFX9-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i16_vgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i16>, ptr addrspace(1) %ptr
%element = extractelement <4 x i16> %vector, i32 %idx
@@ -198,48 +226,74 @@ define i16 @extractelement_vgpr_v4i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
}
define amdgpu_ps i16 @extractelement_sgpr_v4i16_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
-; GCN-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i16_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-NEXT: v_dual_cndmask_b32 v1, s0, v2 :: v_dual_and_b32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(4) %ptr
@@ -250,19 +304,19 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_vgpr_idx(ptr addrspace(4) inreg
define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx0(ptr addrspace(4) inreg %ptr) {
; GCN-LABEL: extractelement_sgpr_v4i16_idx0:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i16_idx0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i16_idx0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(4) %ptr
@@ -271,25 +325,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx0(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i16_idx1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_idx1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_idx1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 2
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_idx1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i16_idx1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i16_idx1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(4) %ptr
%element = extractelement <4 x i16> %vector, i32 1
@@ -297,25 +376,34 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx1(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i16_idx2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_idx2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_idx2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x4
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_idx2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s0, s[2:3], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i16_idx2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[2:3], 0x4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i16_idx2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s1
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(4) %ptr
%element = extractelement <4 x i16> %vector, i32 2
@@ -323,25 +411,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx2(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v4i16_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i16_idx3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s1, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i16_idx3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i16_idx3:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 6
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i16_idx3:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i16_idx3:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s1, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i16_idx3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s1, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:6
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i16>, ptr addrspace(4) %ptr
%element = extractelement <4 x i16> %vector, i32 3
@@ -352,14 +465,14 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i16_idx0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i16_idx0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -369,21 +482,21 @@ define i16 @extractelement_vgpr_v4i16_idx0(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i16_idx0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i16_idx0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i16>, ptr addrspace(1) %ptr
@@ -395,17 +508,17 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i16_idx1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i16_idx1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i16_idx1:
@@ -414,25 +527,22 @@ define i16 @extractelement_vgpr_v4i16_idx1(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i16_idx1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i16_idx1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i16>, ptr addrspace(1) %ptr
%element = extractelement <4 x i16> %vector, i32 1
@@ -443,17 +553,17 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i16_idx2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i16_idx2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i16_idx2:
@@ -462,25 +572,22 @@ define i16 @extractelement_vgpr_v4i16_idx2(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i16_idx2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i16_idx2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i16>, ptr addrspace(1) %ptr
%element = extractelement <4 x i16> %vector, i32 2
@@ -491,17 +598,17 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i16_idx3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:6
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i16_idx3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i16_idx3:
@@ -510,25 +617,22 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i16_idx3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i16_idx3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:6
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i16>, ptr addrspace(1) %ptr
%element = extractelement <4 x i16> %vector, i32 3
@@ -536,54 +640,74 @@ define i16 @extractelement_vgpr_v4i16_idx3(ptr addrspace(1) %ptr) {
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_lshr_b32 s5, s4, 1
-; GCN-NEXT: s_cmp_eq_u32 s5, 1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cselect_b32 s0, s1, s0
-; GCN-NEXT: s_cmp_eq_u32 s5, 2
-; GCN-NEXT: s_cselect_b32 s0, s2, s0
-; GCN-NEXT: s_cmp_eq_u32 s5, 3
-; GCN-NEXT: s_cselect_b32 s0, s3, s0
-; GCN-NEXT: s_and_b32 s1, s4, 1
-; GCN-NEXT: s_lshl_b32 s1, s1, 4
-; GCN-NEXT: s_lshr_b32 s0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s4, 7
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: s_add_u32 s0, s2, s0
+; GFX9-NEXT: s_addc_u32 s1, s3, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s4, 7
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: s_add_u32 s0, s2, s0
+; GFX8-NEXT: s_addc_u32 s1, s3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_and_b32 s2, s4, 7
+; GFX7-NEXT: s_lshl_b32 s4, s2, 1
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_lshr_b32 s5, s4, 1
-; GFX10-NEXT: s_cmp_eq_u32 s5, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cselect_b32 s0, s1, s0
-; GFX10-NEXT: s_cmp_eq_u32 s5, 2
-; GFX10-NEXT: s_cselect_b32 s0, s2, s0
-; GFX10-NEXT: s_cmp_eq_u32 s5, 3
-; GFX10-NEXT: s_cselect_b32 s0, s3, s0
-; GFX10-NEXT: s_and_b32 s1, s4, 1
-; GFX10-NEXT: s_lshl_b32 s1, s1, 4
-; GFX10-NEXT: s_lshr_b32 s0, s0, s1
+; GFX10-NEXT: s_and_b32 s0, s4, 7
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_add_u32 s0, s2, s0
+; GFX10-NEXT: s_addc_u32 s1, s3, s1
+; GFX10-NEXT: global_load_ushort v0, v0, s[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_lshr_b32 s5, s4, 1
+; GFX11-NEXT: s_and_b32 s0, s4, 7
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_eq_u32 s5, 1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cselect_b32 s0, s1, s0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 2
-; GFX11-NEXT: s_cselect_b32 s0, s2, s0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 3
-; GFX11-NEXT: s_cselect_b32 s0, s3, s0
-; GFX11-NEXT: s_and_b32 s1, s4, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s1, s1, 4
-; GFX11-NEXT: s_lshr_b32 s0, s0, s1
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 %idx
@@ -593,92 +717,71 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_sgpr_idx(ptr addrspace(4) inreg
define amdgpu_ps i16 @extractelement_vgpr_v8i16_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: s_lshr_b32 s0, s2, 1
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX9-NEXT: s_and_b32 s1, s2, 1
+; GFX9-NEXT: s_and_b32 s0, s2, 7
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT: s_lshl_b32 s0, s1, 4
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: s_lshr_b32 s0, s2, 1
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_and_b32 s1, s2, 1
+; GFX8-NEXT: s_and_b32 s0, s2, 7
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT: s_lshl_b32 s0, s1, 4
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_lshr_b32 s0, s2, 1
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_and_b32 s1, s2, 1
+; GFX7-NEXT: s_and_b32 s0, s2, 7
+; GFX7-NEXT: s_lshl_b32 s0, s0, 1
+; GFX7-NEXT: s_ashr_i32 s1, s0, 31
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX7-NEXT: s_lshl_b32 s0, s1, 4
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: s_lshr_b32 s0, s2, 1
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
+; GFX10-NEXT: s_and_b32 s0, s2, 7
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX10-NEXT: s_and_b32 s0, s2, 1
-; GFX10-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_vgpr_v8i16_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: s_lshr_b32 s0, s2, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
+; GFX11-NEXT: s_and_b32 s0, s2, 7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s0, s0, 1
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX11-NEXT: s_and_b32 s0, s2, 1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: s_lshl_b32 s0, s0, 4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(1) %ptr
@@ -690,91 +793,66 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
; GFX9-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 1, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_vgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 %idx
@@ -782,64 +860,74 @@ define i16 @extractelement_vgpr_v8i16_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx)
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
-; GCN-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1
-; GCN-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_and_b32_e32 v0, 7, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_and_b32_e32 v0, 7, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_and_b32_e32 v0, 7, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_and_b32_e32 v0, 7, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: v_and_b32_e32 v0, 7, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
@@ -850,19 +938,19 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_vgpr_idx(ptr addrspace(4) inreg
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx0(ptr addrspace(4) inreg %ptr) {
; GCN-LABEL: extractelement_sgpr_v8i16_idx0:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
@@ -871,25 +959,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx0(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 2
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 1
@@ -897,25 +1010,34 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx1(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x4
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s0, s[2:3], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[2:3], 0x4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s1
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 2
@@ -923,25 +1045,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx2(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s1, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx3:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 6
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx3:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx3:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s1, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s1, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:6
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 3
@@ -949,25 +1096,34 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx3(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx4(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s2
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x8
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx4:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x8
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx4:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s0, s[2:3], 0x2
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[2:3], 0x8
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx4:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 4
@@ -975,25 +1131,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx4(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx5(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx5:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s2, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx5:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:10
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx5:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 10
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx5:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:10
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx5:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s2, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:10
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx5:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s2, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:10
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 5
@@ -1001,25 +1182,34 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx5(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx6(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx6:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s3
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx6:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0xc
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx6:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0xc
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx6:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s0, s[2:3], 0x3
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx6:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[2:3], 0xc
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx6:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0xc
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s3
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 6
@@ -1027,25 +1217,50 @@ define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx6(ptr addrspace(4) inreg %ptr
}
define amdgpu_ps i16 @extractelement_sgpr_v8i16_idx7(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i16_idx7:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s3, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i16_idx7:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] offset:14
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i16_idx7:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 14
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i16_idx7:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:14
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i16_idx7:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s3, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] offset:14
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i16_idx7:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s3, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:14
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i16>, ptr addrspace(4) %ptr
%element = extractelement <8 x i16> %vector, i32 7
@@ -1056,14 +1271,14 @@ define i16 @extractelement_vgpr_v8i16_idx0(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1073,21 +1288,21 @@ define i16 @extractelement_vgpr_v8i16_idx0(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
@@ -1099,17 +1314,17 @@ define i16 @extractelement_vgpr_v8i16_idx1(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx1:
@@ -1118,25 +1333,22 @@ define i16 @extractelement_vgpr_v8i16_idx1(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 1
@@ -1147,17 +1359,17 @@ define i16 @extractelement_vgpr_v8i16_idx2(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx2:
@@ -1166,25 +1378,22 @@ define i16 @extractelement_vgpr_v8i16_idx2(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 2
@@ -1195,17 +1404,17 @@ define i16 @extractelement_vgpr_v8i16_idx3(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:6
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx3:
@@ -1214,25 +1423,22 @@ define i16 @extractelement_vgpr_v8i16_idx3(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:6
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 3
@@ -1243,17 +1449,17 @@ define i16 @extractelement_vgpr_v8i16_idx4(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx4:
@@ -1262,25 +1468,22 @@ define i16 @extractelement_vgpr_v8i16_idx4(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:8
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:8
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 4
@@ -1291,17 +1494,17 @@ define i16 @extractelement_vgpr_v8i16_idx5(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx5:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:10
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx5:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx5:
@@ -1310,25 +1513,22 @@ define i16 @extractelement_vgpr_v8i16_idx5(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:10
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx5:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:10
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx5:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:10
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 5
@@ -1339,17 +1539,17 @@ define i16 @extractelement_vgpr_v8i16_idx6(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx6:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:12
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx6:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx6:
@@ -1358,25 +1558,22 @@ define i16 @extractelement_vgpr_v8i16_idx6(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:12
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx6:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:12
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx6:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:12
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 6
@@ -1387,17 +1584,17 @@ define i16 @extractelement_vgpr_v8i16_idx7(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i16_idx7:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:14
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i16_idx7:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i16_idx7:
@@ -1406,25 +1603,22 @@ define i16 @extractelement_vgpr_v8i16_idx7(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i16_idx7:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:14
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i16_idx7:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:14
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i16>, ptr addrspace(1) %ptr
%element = extractelement <8 x i16> %vector, i32 7
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
index c7f49d526fac06a..5430a38b9cc65b3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll
@@ -6,32 +6,68 @@
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT: s_and_b32 s1, s4, 3
-; GCN-NEXT: s_lshl_b32 s1, s1, 3
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s4, 3
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: s_add_u32 s0, s2, s0
+; GFX9-NEXT: s_addc_u32 s1, s3, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s4, 3
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: s_add_u32 s0, s2, s0
+; GFX8-NEXT: s_addc_u32 s1, s3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_and_b32 s4, s4, 3
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT: s_and_b32 s1, s4, 3
-; GFX10-NEXT: s_lshl_b32 s1, s1, 3
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, s1
+; GFX10-NEXT: s_and_b32 s0, s4, 3
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_add_u32 s0, s2, s0
+; GFX10-NEXT: s_addc_u32 s1, s3, s1
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i8_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT: s_and_b32 s1, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s1, s1, 3
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s0, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i8>, ptr addrspace(4) %ptr
%element = extractelement <4 x i8> %vector, i32 %idx
@@ -41,55 +77,65 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(ptr addrspace(4) inreg %p
define amdgpu_ps i8 @extractelement_vgpr_v4i8_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_and_b32 s0, s2, 3
-; GFX9-NEXT: s_lshl_b32 s0, s0, 3
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_and_b32 s0, s2, 3
-; GFX8-NEXT: s_lshl_b32 s0, s0, 3
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_and_b32 s0, s2, 3
-; GFX7-NEXT: s_lshl_b32 s0, s0, 3
+; GFX7-NEXT: s_ashr_i32 s1, s0, 31
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_and_b32 s0, s2, 3
-; GFX10-NEXT: s_lshl_b32 s0, s0, 3
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_vgpr_v4i8_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_and_b32 s0, s2, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_lshl_b32 s0, s0, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i8>, ptr addrspace(1) %ptr
@@ -101,55 +147,60 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX9-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
-; GFX9-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX9-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i8_vgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i8>, ptr addrspace(1) %ptr
%element = extractelement <4 x i8> %vector, i32 %idx
@@ -159,53 +210,67 @@ define i8 @extractelement_vgpr_v4i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
define amdgpu_ps i8 @extractelement_sgpr_v4i8_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
; GFX9-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s0
+; GFX9-NEXT: v_and_b32_e32 v2, 3, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX8-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s0
+; GFX8-NEXT: v_and_b32_e32 v2, 3, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: v_lshr_b32_e32 v0, s0, v0
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s0
+; GFX10-NEXT: v_and_b32_e32 v2, 3, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i8_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 3, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i8>, ptr addrspace(4) %ptr
@@ -237,25 +302,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx0(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i8_idx1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 8
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_idx1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_idx1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 1
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_idx1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i8_idx1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 8
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i8_idx1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 8
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i8>, ptr addrspace(4) %ptr
%element = extractelement <4 x i8> %vector, i32 1
@@ -263,25 +353,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx1(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i8_idx2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_idx2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_idx2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 2
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_idx2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i8_idx2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i8_idx2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i8>, ptr addrspace(4) %ptr
%element = extractelement <4 x i8> %vector, i32 2
@@ -289,25 +404,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx2(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v4i8_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v4i8_idx3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 24
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v4i8_idx3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v4i8_idx3:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 3
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v4i8_idx3:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v4i8_idx3:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v4i8_idx3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <4 x i8>, ptr addrspace(4) %ptr
%element = extractelement <4 x i8> %vector, i32 3
@@ -318,14 +458,14 @@ define i8 @extractelement_vgpr_v4i8_idx0(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i8_idx0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i8_idx0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -335,21 +475,21 @@ define i8 @extractelement_vgpr_v4i8_idx0(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i8_idx0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i8_idx0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i8>, ptr addrspace(1) %ptr
@@ -361,17 +501,17 @@ define i8 @extractelement_vgpr_v4i8_idx1(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i8_idx1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i8_idx1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i8_idx1:
@@ -380,25 +520,22 @@ define i8 @extractelement_vgpr_v4i8_idx1(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i8_idx1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i8_idx1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i8>, ptr addrspace(1) %ptr
%element = extractelement <4 x i8> %vector, i32 1
@@ -409,17 +546,17 @@ define i8 @extractelement_vgpr_v4i8_idx2(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i8_idx2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i8_idx2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i8_idx2:
@@ -428,25 +565,22 @@ define i8 @extractelement_vgpr_v4i8_idx2(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i8_idx2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i8_idx2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i8>, ptr addrspace(1) %ptr
%element = extractelement <4 x i8> %vector, i32 2
@@ -457,17 +591,17 @@ define i8 @extractelement_vgpr_v4i8_idx3(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v4i8_idx3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v4i8_idx3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v0, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v4i8_idx3:
@@ -476,25 +610,22 @@ define i8 @extractelement_vgpr_v4i8_idx3(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v4i8_idx3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:3
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v4i8_idx3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:3
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <4 x i8>, ptr addrspace(1) %ptr
%element = extractelement <4 x i8> %vector, i32 3
@@ -502,42 +633,68 @@ define i8 @extractelement_vgpr_v4i8_idx3(ptr addrspace(1) %ptr) {
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_lshr_b32 s2, s4, 2
-; GCN-NEXT: s_cmp_eq_u32 s2, 1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cselect_b32 s0, s1, s0
-; GCN-NEXT: s_and_b32 s1, s4, 3
-; GCN-NEXT: s_lshl_b32 s1, s1, 3
-; GCN-NEXT: s_lshr_b32 s0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s4, 7
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: s_add_u32 s0, s2, s0
+; GFX9-NEXT: s_addc_u32 s1, s3, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s4, 7
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: s_add_u32 s0, s2, s0
+; GFX8-NEXT: s_addc_u32 s1, s3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_and_b32 s4, s4, 7
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_lshr_b32 s2, s4, 2
-; GFX10-NEXT: s_cmp_eq_u32 s2, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cselect_b32 s0, s1, s0
-; GFX10-NEXT: s_and_b32 s1, s4, 3
-; GFX10-NEXT: s_lshl_b32 s1, s1, 3
-; GFX10-NEXT: s_lshr_b32 s0, s0, s1
+; GFX10-NEXT: s_and_b32 s0, s4, 7
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_add_u32 s0, s2, s0
+; GFX10-NEXT: s_addc_u32 s1, s3, s1
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_lshr_b32 s2, s4, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_eq_u32 s2, 1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cselect_b32 s0, s1, s0
-; GFX11-NEXT: s_and_b32 s1, s4, 3
-; GFX11-NEXT: s_lshl_b32 s1, s1, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshr_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s0, s4, 7
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 %idx
@@ -547,71 +704,65 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(ptr addrspace(4) inreg %p
define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: s_lshr_b32 s0, s2, 2
-; GFX9-NEXT: s_and_b32 s1, s2, 3
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX9-NEXT: s_lshl_b32 s0, s1, 3
+; GFX9-NEXT: s_and_b32 s0, s2, 7
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: s_lshr_b32 s0, s2, 2
-; GFX8-NEXT: s_and_b32 s1, s2, 3
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_lshl_b32 s0, s1, 3
+; GFX8-NEXT: s_and_b32 s0, s2, 7
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_lshr_b32 s0, s2, 2
-; GFX7-NEXT: s_and_b32 s1, s2, 3
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_lshl_b32 s0, s1, 3
+; GFX7-NEXT: s_and_b32 s0, s2, 7
+; GFX7-NEXT: s_ashr_i32 s1, s0, 31
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: s_lshr_b32 s0, s2, 2
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX10-NEXT: s_and_b32 s0, s2, 3
-; GFX10-NEXT: s_lshl_b32 s0, s0, 3
+; GFX10-NEXT: s_and_b32 s0, s2, 7
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_vgpr_v8i8_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: s_lshr_b32 s0, s2, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX11-NEXT: s_and_b32 s0, s2, 3
-; GFX11-NEXT: s_lshl_b32 s0, s0, 3
+; GFX11-NEXT: s_and_b32 s0, s2, 7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(1) %ptr
@@ -623,70 +774,60 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX9-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v2
-; GFX9-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 2, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_vgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 %idx
@@ -694,48 +835,69 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
-; GCN-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_and_b32_e32 v0, 3, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_and_b32_e32 v0, 7, v0
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 7, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-NEXT: v_dual_cndmask_b32 v1, s0, v2 :: v_dual_and_b32 v0, 3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 7, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
@@ -746,19 +908,19 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(ptr addrspace(4) inreg %p
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(ptr addrspace(4) inreg %ptr) {
; GCN-LABEL: extractelement_sgpr_v8i8_idx0:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
@@ -767,25 +929,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx0(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 8
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx1:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 1
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 8
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 8
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 1
@@ -793,25 +980,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx1(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx2:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx2:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 2
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 2
@@ -819,25 +1031,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx2(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx3:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s0, 24
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx3:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 3
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx3:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx3:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s0, 24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:3
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 3
@@ -845,25 +1082,34 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx3(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s0, s[2:3], 0x4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx4:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x4
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx4:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_load_dword s0, s[2:3], 0x1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[2:3], 0x4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx4:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x4
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s1
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 4
@@ -871,25 +1117,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx4(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx5:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s1, 8
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx5:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:5
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx5:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 5
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx5:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx5:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s1, 8
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:5
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx5:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s1, 8
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:5
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 5
@@ -897,25 +1168,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx5(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx6:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s1, 16
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx6:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx6:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 6
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx6:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx6:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s1, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx6:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s1, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:6
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 6
@@ -923,25 +1219,50 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx6(ptr addrspace(4) inreg %ptr)
}
define amdgpu_ps i8 @extractelement_sgpr_v8i8_idx7(ptr addrspace(4) inreg %ptr) {
-; GCN-LABEL: extractelement_sgpr_v8i8_idx7:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s0, s1, 24
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v8i8_idx7:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:7
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v8i8_idx7:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_u32 s0, s2, 7
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v8i8_idx7:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:7
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v8i8_idx7:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_lshr_b32 s0, s1, 24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:7
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v8i8_idx7:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s0, s1, 24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:7
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <8 x i8>, ptr addrspace(4) %ptr
%element = extractelement <8 x i8> %vector, i32 7
@@ -952,14 +1273,14 @@ define i8 @extractelement_vgpr_v8i8_idx0(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -969,21 +1290,21 @@ define i8 @extractelement_vgpr_v8i8_idx0(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
@@ -995,17 +1316,17 @@ define i8 @extractelement_vgpr_v8i8_idx1(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx1:
@@ -1014,25 +1335,22 @@ define i8 @extractelement_vgpr_v8i8_idx1(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 1
@@ -1043,17 +1361,17 @@ define i8 @extractelement_vgpr_v8i8_idx2(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx2:
@@ -1062,25 +1380,22 @@ define i8 @extractelement_vgpr_v8i8_idx2(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 2
@@ -1091,17 +1406,17 @@ define i8 @extractelement_vgpr_v8i8_idx3(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx3:
@@ -1110,25 +1425,22 @@ define i8 @extractelement_vgpr_v8i8_idx3(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:3
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:3
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 3
@@ -1139,17 +1451,17 @@ define i8 @extractelement_vgpr_v8i8_idx4(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx4:
@@ -1158,25 +1470,22 @@ define i8 @extractelement_vgpr_v8i8_idx4(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 4
@@ -1187,17 +1496,17 @@ define i8 @extractelement_vgpr_v8i8_idx5(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx5:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx5:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 5, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx5:
@@ -1206,25 +1515,22 @@ define i8 @extractelement_vgpr_v8i8_idx5(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx5:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:5
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx5:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:5
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 5
@@ -1235,17 +1541,17 @@ define i8 @extractelement_vgpr_v8i8_idx6(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx6:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:6
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx6:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx6:
@@ -1254,25 +1560,22 @@ define i8 @extractelement_vgpr_v8i8_idx6(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx6:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx6:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:6
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 6
@@ -1283,17 +1586,17 @@ define i8 @extractelement_vgpr_v8i8_idx7(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v8i8_idx7:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v8i8_idx7:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 7, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v8i8_idx7:
@@ -1302,25 +1605,22 @@ define i8 @extractelement_vgpr_v8i8_idx7(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:7
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v8i8_idx7:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v8i8_idx7:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:7
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <8 x i8>, ptr addrspace(1) %ptr
%element = extractelement <8 x i8> %vector, i32 7
@@ -1328,54 +1628,68 @@ define i8 @extractelement_vgpr_v8i8_idx7(ptr addrspace(1) %ptr) {
}
define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
-; GCN-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: s_lshr_b32 s5, s4, 2
-; GCN-NEXT: s_cmp_eq_u32 s5, 1
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cselect_b32 s0, s1, s0
-; GCN-NEXT: s_cmp_eq_u32 s5, 2
-; GCN-NEXT: s_cselect_b32 s0, s2, s0
-; GCN-NEXT: s_cmp_eq_u32 s5, 3
-; GCN-NEXT: s_cselect_b32 s0, s3, s0
-; GCN-NEXT: s_and_b32 s1, s4, 3
-; GCN-NEXT: s_lshl_b32 s1, s1, 3
-; GCN-NEXT: s_lshr_b32 s0, s0, s1
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s4, 15
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: s_add_u32 s0, s2, s0
+; GFX9-NEXT: s_addc_u32 s1, s3, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s4, 15
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: s_add_u32 s0, s2, s0
+; GFX8-NEXT: s_addc_u32 s1, s3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_and_b32 s4, s4, 15
+; GFX7-NEXT: s_ashr_i32 s5, s4, 31
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: s_lshr_b32 s5, s4, 2
-; GFX10-NEXT: s_cmp_eq_u32 s5, 1
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cselect_b32 s0, s1, s0
-; GFX10-NEXT: s_cmp_eq_u32 s5, 2
-; GFX10-NEXT: s_cselect_b32 s0, s2, s0
-; GFX10-NEXT: s_cmp_eq_u32 s5, 3
-; GFX10-NEXT: s_cselect_b32 s0, s3, s0
-; GFX10-NEXT: s_and_b32 s1, s4, 3
-; GFX10-NEXT: s_lshl_b32 s1, s1, 3
-; GFX10-NEXT: s_lshr_b32 s0, s0, s1
+; GFX10-NEXT: s_and_b32 s0, s4, 15
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_add_u32 s0, s2, s0
+; GFX10-NEXT: s_addc_u32 s1, s3, s1
+; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: s_lshr_b32 s5, s4, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_eq_u32 s5, 1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cselect_b32 s0, s1, s0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 2
-; GFX11-NEXT: s_cselect_b32 s0, s2, s0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 3
-; GFX11-NEXT: s_cselect_b32 s0, s3, s0
-; GFX11-NEXT: s_and_b32 s1, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s1, s1, 3
-; GFX11-NEXT: s_lshr_b32 s0, s0, s1
+; GFX11-NEXT: s_and_b32 s0, s4, 15
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <16 x i8>, ptr addrspace(4) %ptr
%element = extractelement <16 x i8> %vector, i32 %idx
@@ -1385,92 +1699,65 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(ptr addrspace(4) inreg %
define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
; GFX9-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX9-NEXT: s_lshr_b32 s0, s2, 2
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX9-NEXT: s_and_b32 s1, s2, 3
+; GFX9-NEXT: s_and_b32 s0, s2, 15
+; GFX9-NEXT: s_ashr_i32 s1, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT: s_lshl_b32 s0, s1, 3
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: s_lshr_b32 s0, s2, 2
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX8-NEXT: s_and_b32 s1, s2, 3
+; GFX8-NEXT: s_and_b32 s0, s2, 15
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT: s_lshl_b32 s0, s1, 3
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
; GFX7: ; %bb.0:
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: s_lshr_b32 s0, s2, 2
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
-; GFX7-NEXT: s_and_b32 s1, s2, 3
+; GFX7-NEXT: s_and_b32 s0, s2, 15
+; GFX7-NEXT: s_ashr_i32 s1, s0, 31
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 3
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX7-NEXT: s_lshl_b32 s0, s1, 3
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
-; GFX10-NEXT: s_lshr_b32 s0, s2, 2
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
+; GFX10-NEXT: s_and_b32 s0, s2, 15
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX10-NEXT: s_and_b32 s0, s2, 3
-; GFX10-NEXT: s_lshl_b32 s0, s0, 3
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_vgpr_v16i8_sgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
-; GFX11-NEXT: s_lshr_b32 s0, s2, 2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
+; GFX11-NEXT: s_and_b32 s0, s2, 15
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX11-NEXT: s_and_b32 s0, s2, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: s_lshl_b32 s0, s0, 3
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <16 x i8>, ptr addrspace(1) %ptr
@@ -1482,91 +1769,60 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
; GFX9-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 2, v2
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 2, v2
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 2, v2
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 3, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 2, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_vgpr_idx:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 2, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 %idx
@@ -1574,64 +1830,69 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
}
define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
-; GCN-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT: v_and_b32_e32 v0, 3, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1
-; GCN-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: ; return to shader part epilog
+; GFX9-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_and_b32_e32 v2, 15, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_and_b32_e32 v2, 15, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX7-NEXT: s_mov_b32 s0, s2
+; GFX7-NEXT: s_mov_b32 s1, s3
+; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX7-NEXT: s_mov_b32 s2, 0
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0
-; GFX10-NEXT: v_and_b32_e32 v0, 3, v0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_and_b32_e32 v2, 15, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 3, v0
+; GFX11-NEXT: v_and_b32_e32 v2, 15, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%vector = load <16 x i8>, ptr addrspace(4) %ptr
@@ -1643,14 +1904,14 @@ define i8 @extractelement_vgpr_v16i8_idx0(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1660,21 +1921,21 @@ define i8 @extractelement_vgpr_v16i8_idx0(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
@@ -1686,17 +1947,17 @@ define i8 @extractelement_vgpr_v16i8_idx1(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx1:
@@ -1705,25 +1966,22 @@ define i8 @extractelement_vgpr_v16i8_idx1(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 1
@@ -1734,17 +1992,17 @@ define i8 @extractelement_vgpr_v16i8_idx2(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx2:
@@ -1753,25 +2011,22 @@ define i8 @extractelement_vgpr_v16i8_idx2(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 2
@@ -1782,17 +2037,17 @@ define i8 @extractelement_vgpr_v16i8_idx3(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:3
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx3:
@@ -1801,25 +2056,22 @@ define i8 @extractelement_vgpr_v16i8_idx3(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx3:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:3
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:3
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 3
@@ -1830,17 +2082,17 @@ define i8 @extractelement_vgpr_v16i8_idx4(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx4:
@@ -1849,25 +2101,22 @@ define i8 @extractelement_vgpr_v16i8_idx4(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx4:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 4
@@ -1878,17 +2127,17 @@ define i8 @extractelement_vgpr_v16i8_idx5(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx5:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx5:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 5, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx5:
@@ -1897,25 +2146,22 @@ define i8 @extractelement_vgpr_v16i8_idx5(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx5:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:5
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx5:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:5
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 5
@@ -1926,17 +2172,17 @@ define i8 @extractelement_vgpr_v16i8_idx6(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx6:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:6
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx6:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx6:
@@ -1945,25 +2191,22 @@ define i8 @extractelement_vgpr_v16i8_idx6(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:6
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx6:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx6:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:6
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 6
@@ -1974,17 +2217,17 @@ define i8 @extractelement_vgpr_v16i8_idx7(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx7:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx7:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 7, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx7:
@@ -1993,25 +2236,22 @@ define i8 @extractelement_vgpr_v16i8_idx7(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:7
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx7:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:7
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx7:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:7
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 7
@@ -2022,17 +2262,17 @@ define i8 @extractelement_vgpr_v16i8_idx8(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx8:
@@ -2041,25 +2281,22 @@ define i8 @extractelement_vgpr_v16i8_idx8(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:8
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:8
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:8
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 8
@@ -2070,17 +2307,17 @@ define i8 @extractelement_vgpr_v16i8_idx9(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx9:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:9
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx9:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 9, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx9:
@@ -2089,25 +2326,22 @@ define i8 @extractelement_vgpr_v16i8_idx9(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:9
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx9:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:9
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx9:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:9
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 9
@@ -2118,17 +2352,17 @@ define i8 @extractelement_vgpr_v16i8_idx10(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx10:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:10
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx10:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx10:
@@ -2137,25 +2371,22 @@ define i8 @extractelement_vgpr_v16i8_idx10(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:10
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx10:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:10
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:10
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 10
@@ -2166,17 +2397,17 @@ define i8 @extractelement_vgpr_v16i8_idx11(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx11:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:11
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx11:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 11, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx11:
@@ -2185,25 +2416,22 @@ define i8 @extractelement_vgpr_v16i8_idx11(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:11
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx11:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:11
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx11:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:11
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 11
@@ -2214,17 +2442,17 @@ define i8 @extractelement_vgpr_v16i8_idx12(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx12:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:12
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx12:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx12:
@@ -2233,25 +2461,22 @@ define i8 @extractelement_vgpr_v16i8_idx12(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:12
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx12:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:12
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx12:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:12
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 12
@@ -2262,17 +2487,17 @@ define i8 @extractelement_vgpr_v16i8_idx13(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx13:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:13
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx13:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 13, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx13:
@@ -2281,25 +2506,22 @@ define i8 @extractelement_vgpr_v16i8_idx13(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:13
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx13:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:13
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx13:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:13
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 13
@@ -2310,17 +2532,17 @@ define i8 @extractelement_vgpr_v16i8_idx14(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx14:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:14
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx14:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx14:
@@ -2329,25 +2551,22 @@ define i8 @extractelement_vgpr_v16i8_idx14(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:14
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx14:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:14
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx14:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:14
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 14
@@ -2358,17 +2577,17 @@ define i8 @extractelement_vgpr_v16i8_idx15(ptr addrspace(1) %ptr) {
; GFX9-LABEL: extractelement_vgpr_v16i8_idx15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:15
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: extractelement_vgpr_v16i8_idx15:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 15, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: extractelement_vgpr_v16i8_idx15:
@@ -2377,25 +2596,22 @@ define i8 @extractelement_vgpr_v16i8_idx15(ptr addrspace(1) %ptr) {
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:15
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: extractelement_vgpr_v16i8_idx15:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
+; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:15
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: extractelement_vgpr_v16i8_idx15:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:15
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vector = load <16 x i8>, ptr addrspace(1) %ptr
%element = extractelement <16 x i8> %vector, i32 15
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 8bf34caea40513d..d4c536bdd5ebe14 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -4738,35 +4738,31 @@ define i32 @v_extract_v64i32_7(ptr addrspace(1) %ptr) {
; GPRIDX-LABEL: v_extract_v64i32_7:
; GPRIDX: ; %bb.0:
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GPRIDX-NEXT: global_load_dword v0, v[0:1], off offset:28
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v7
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
; MOVREL-LABEL: v_extract_v64i32_7:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 28, v0
; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; MOVREL-NEXT: flat_load_dword v0, v[0:1]
; MOVREL-NEXT: s_waitcnt vmcnt(0)
-; MOVREL-NEXT: v_mov_b32_e32 v0, v7
; MOVREL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_extract_v64i32_7:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:28
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_extract_v64i32_7:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:28
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
%elt = extractelement <64 x i32> %vec, i32 7
@@ -4777,7 +4773,7 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) {
; GPRIDX-LABEL: v_extract_v64i32_32:
; GPRIDX: ; %bb.0:
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128
+; GPRIDX-NEXT: global_load_dword v0, v[0:1], off offset:128
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
@@ -4786,21 +4782,21 @@ define i32 @v_extract_v64i32_32(ptr addrspace(1) %ptr) {
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; MOVREL-NEXT: flat_load_dword v0, v[0:1]
; MOVREL-NEXT: s_waitcnt vmcnt(0)
; MOVREL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_extract_v64i32_32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:128
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_extract_v64i32_32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:128
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
@@ -4812,35 +4808,31 @@ define i32 @v_extract_v64i32_33(ptr addrspace(1) %ptr) {
; GPRIDX-LABEL: v_extract_v64i32_33:
; GPRIDX: ; %bb.0:
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128
+; GPRIDX-NEXT: global_load_dword v0, v[0:1], off offset:132
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v1
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
; MOVREL-LABEL: v_extract_v64i32_33:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0
+; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x84, v0
; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
+; MOVREL-NEXT: flat_load_dword v0, v[0:1]
; MOVREL-NEXT: s_waitcnt vmcnt(0)
-; MOVREL-NEXT: v_mov_b32_e32 v0, v1
; MOVREL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_extract_v64i32_33:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:132
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_extract_v64i32_33:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:128
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:132
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
%elt = extractelement <64 x i32> %vec, i32 33
@@ -4851,35 +4843,31 @@ define i32 @v_extract_v64i32_37(ptr addrspace(1) %ptr) {
; GPRIDX-LABEL: v_extract_v64i32_37:
; GPRIDX: ; %bb.0:
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GPRIDX-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:144
+; GPRIDX-NEXT: global_load_dword v0, v[0:1], off offset:148
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v5
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
; MOVREL-LABEL: v_extract_v64i32_37:
; MOVREL: ; %bb.0:
; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x90, v0
+; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 0x94, v0
; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
+; MOVREL-NEXT: flat_load_dword v0, v[0:1]
; MOVREL-NEXT: s_waitcnt vmcnt(0)
-; MOVREL-NEXT: v_mov_b32_e32 v0, v5
; MOVREL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_extract_v64i32_37:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:144
+; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:148
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_extract_v64i32_37:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:144
+; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:148
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
%vec = load <64 x i32>, ptr addrspace(1) %ptr
%elt = extractelement <64 x i32> %vec, i32 37
More information about the llvm-commits mailing list