[llvm] [GISel] Combine vector load followed by an extractelement (PR #72670)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 17 08:05:18 PST 2023
llvmbot wrote:
@llvm/pr-subscribers-llvm-globalisel
Author: Pranav Taneja (prtaneja)
Changes:
Narrow a vector load followed by a G_EXTRACT_VECTOR_ELT into a scalar load of just the extracted element. This mirrors the scalarizeExtractedVectorLoad combine in the DAGCombiner.
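As an illustration (taken from the AArch64 test updated below), a load/extractelement pair with a variable index no longer needs to spill the whole vector to the stack:

```llvm
define i32 @load_single_extract_variable_index_i32(ptr %A, i32 %idx) {
  %lv = load <4 x i32>, ptr %A
  %e = extractelement <4 x i32> %lv, i32 %idx
  ret i32 %e
}
```

With the combine, GlobalISel selects a single indexed scalar load (`ldr w0, [x0, x8, lsl #2]`) instead of storing the vector to a stack slot and reloading one lane.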
---
Patch is 343.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/72670.diff
10 Files Affected:
- (modified) llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h (+5)
- (modified) llvm/include/llvm/Target/GlobalISel/Combine.td (+8-2)
- (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp (+134)
- (modified) llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll (+13-45)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll (+40-40)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll (+18-497)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll (+202-697)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll (+767-573)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll (+1034-818)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll (+19-31)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index ba72a3b71ffd70b..f51401754c3f3fa 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -196,6 +196,11 @@ class CombinerHelper {
/// Match (and (load x), mask) -> zextload x
bool matchCombineLoadWithAndMask(MachineInstr &MI, BuildFnTy &MatchInfo);
+ /// Combine a G_EXTRACT_VECTOR_ELT of a load into a narrowed
+ /// load.
+ bool matchCombineExtractedVectorLoad(MachineInstr &MI);
+ void applyCombineExtractedVectorLoad(MachineInstr &MI);
+
bool matchCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo);
void applyCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 76b83cc5df073ae..ee8dac06b6a4328 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -259,6 +259,12 @@ def sext_inreg_to_zext_inreg : GICombineRule<
}])
>;
+def combine_extracted_vector_load : GICombineRule<
+ (defs root:$root),
+ (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root,
+ [{ return Helper.matchCombineExtractedVectorLoad(*${root}); }]),
+ (apply [{ Helper.applyCombineExtractedVectorLoad(*${root}); }])>;
+
def combine_indexed_load_store : GICombineRule<
(defs root:$root, indexed_load_store_matchdata:$matchinfo),
(match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD, G_STORE):$root,
@@ -1283,8 +1289,8 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop,
constant_fold_fp_binop]>;
def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
- extract_vec_elt_combines, combines_for_extload,
- undef_combines, identity_combines, phi_combines,
+ extract_vec_elt_combines, combines_for_extload, combine_extracted_vector_load,
+ undef_combines, identity_combines, phi_combines,
simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, shifts_too_big,
reassocs, ptr_add_immed_chain,
shl_ashr_to_sext_inreg, sext_inreg_of_load,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 51c268ab77c2220..b0018b990067afb 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1165,6 +1165,140 @@ bool CombinerHelper::findPreIndexCandidate(GLoadStore &LdSt, Register &Addr,
return RealUse;
}
+bool CombinerHelper::matchCombineExtractedVectorLoad(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
+
+ // Check if there is a load that defines the vector being extracted from.
+ MachineInstr *LoadMI =
+ getOpcodeDef(TargetOpcode::G_LOAD, MI.getOperand(1).getReg(), MRI);
+ if (!LoadMI)
+ return false;
+
+ Register Vector = MI.getOperand(1).getReg();
+ LLT VecEltVT = MRI.getType(Vector).getElementType();
+ LLT ResultVT = MRI.getType(MI.getOperand(0).getReg());
+
+ // Do not combine when result type and vector element type are not the same.
+ if (ResultVT != VecEltVT)
+ return false;
+
+ // Checking whether we should reduce the load width.
+ if (VecEltVT.isVector() || !MRI.hasOneUse(Vector))
+ return false;
+
+ GLoadStore *GLoadMI = cast<GLoadStore>(LoadMI);
+
+ // Check if the defining load is simple.
+ if (!GLoadMI->isSimple())
+ return false;
+
+ // If the vector element type is not a multiple of a byte then we are unable
+ // to correctly compute an address to load only the extracted element as a
+ // scalar.
+ if (!VecEltVT.isByteSized())
+ return false;
+
+ // Check if the new load that we are going to create is legal
+ // if we are in the post-legalization phase.
+ MachineMemOperand MMO = GLoadMI->getMMO();
+ MachinePointerInfo PtrInfo;
+
+ Register Index = MI.getOperand(2).getReg();
+
+ // Finding the appropriate PtrInfo if offset is a known constant.
+ // This is required to create the memory operand for the narrowed load.
+ // This machine memory operand object helps us infer about legality
+ // before we proceed to combine the instruction.
+ if (MRI.getVRegDef(Index)->getOpcode() == TargetOpcode::G_CONSTANT) {
+ MachineInstr *ConstMI = MRI.getVRegDef(Index);
+ const ConstantInt *CVal = ConstMI->getOperand(1).getCImm();
+ int Elt = CVal->getZExtValue();
+ // FIXME: should be (ABI size)*Elt.
+ unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
+ PtrInfo = MMO.getPointerInfo().getWithOffset(PtrOff);
+ } else {
+ // Discard the pointer info except the address space because the memory
+ // operand can't represent this new access since the offset is variable.
+ PtrInfo = MachinePointerInfo(MMO.getPointerInfo().getAddrSpace());
+ }
+
+ Register VecPtr = GLoadMI->getOperand(1).getReg();
+ LLT PtrTy = MRI.getType(VecPtr);
+
+ MachineFunction &MF = *MI.getMF();
+ auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, VecEltVT);
+
+ LegalityQuery::MemDesc MMDesc(*NewMMO);
+
+ LegalityQuery Q = {TargetOpcode::G_LOAD, {VecEltVT, PtrTy}, {MMDesc}};
+
+ if (!isLegalOrBeforeLegalizer(Q))
+ return false;
+
+ // Load must be allowed and fast on the target.
+ LLVMContext &C = MF.getFunction().getContext();
+ auto &DL = MF.getDataLayout();
+ unsigned Fast = 0;
+ if (!getTargetLowering().allowsMemoryAccess(C, DL, VecEltVT, *NewMMO,
+ &Fast) ||
+ !Fast)
+ return false;
+
+ return true;
+}
+
+void CombinerHelper::applyCombineExtractedVectorLoad(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
+
+ // Get the original load instruction.
+ MachineInstr *LoadMI =
+ getOpcodeDef(TargetOpcode::G_LOAD, MI.getOperand(1).getReg(), MRI);
+
+ GLoadStore *GLoadMI = cast<GLoadStore>(LoadMI);
+
+ Register Index = MI.getOperand(2).getReg();
+ LLT VecEltVT = MRI.getType(GLoadMI->getOperand(0).getReg()).getElementType();
+ Register Result = MI.getOperand(0).getReg();
+
+ Align Alignment = GLoadMI->getMMO().getAlign();
+ uint64_t Offset;
+ MachinePointerInfo PtrInfo;
+
+ // Check if Index to extract element from is constant.
+ if (MRI.getVRegDef(Index)->getOpcode() == TargetOpcode::G_CONSTANT) {
+ MachineInstr *ConstMI = MRI.getVRegDef(Index);
+ const ConstantInt *CVal = ConstMI->getOperand(1).getCImm();
+ int Elt = CVal->getZExtValue();
+ // FIXME: should be (ABI size)*Elt.
+ Offset = VecEltVT.getSizeInBits() * Elt / 8;
+ PtrInfo = GLoadMI->getMMO().getPointerInfo().getWithOffset(Offset);
+ } else {
+ // Discard the pointer info except the address space because the memory
+ // operand can't represent this new access since the offset is variable.
+ Offset = VecEltVT.getSizeInBits() / 8;
+ PtrInfo =
+ MachinePointerInfo(GLoadMI->getMMO().getPointerInfo().getAddrSpace());
+ }
+
+ Alignment = commonAlignment(Alignment, Offset);
+
+ MachineOperand &BasePtr = GLoadMI->getOperand(1);
+ MachineIRBuilder MIRBuilder(MI);
+
+ // Get pointer to the vector element.
+ GISelObserverWrapper DummyObserver;
+ LegalizerHelper Helper(MIRBuilder.getMF(), DummyObserver, MIRBuilder);
+ Register finalPtr = Helper.getVectorElementPointer(
+ BasePtr.getReg(), MRI.getType(GLoadMI->getOperand(0).getReg()), Index);
+
+ // New G_LOAD instruction.
+ MIRBuilder.buildLoad(Result, finalPtr, PtrInfo, Alignment);
+
+ // Remove original Extract Element MI and GLOAD instructions.
+ MI.eraseFromParent();
+ GLoadMI->eraseFromParent();
+}
+
bool CombinerHelper::matchCombineIndexedLoadStore(
MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
auto &LdSt = cast<GLoadStore>(MI);
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 0d7620d1c883d68..7493afd672d4378 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -14659,17 +14659,9 @@ define i8 @load_single_extract_variable_index_i8(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_i8:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: mov w9, w1
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and x9, x9, #0xf
-; CHECK-GISEL-NEXT: lsl x10, x9, #1
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: sub x9, x10, x9
-; CHECK-GISEL-NEXT: ldrb w0, [x8, x9]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: mov w8, w1
+; CHECK-GISEL-NEXT: and x8, x8, #0xf
+; CHECK-GISEL-NEXT: ldrb w0, [x0, x8]
; CHECK-GISEL-NEXT: ret
%lv = load <16 x i8>, ptr %A
%e = extractelement <16 x i8> %lv, i32 %idx
@@ -14692,15 +14684,9 @@ define i16 @load_single_extract_variable_index_i16(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_i16:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov w9, w1
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and x9, x9, #0x7
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: ldrh w0, [x8, x9, lsl #1]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: mov w8, w1
+; CHECK-GISEL-NEXT: and x8, x8, #0x7
+; CHECK-GISEL-NEXT: ldrh w0, [x0, x8, lsl #1]
; CHECK-GISEL-NEXT: ret
%lv = load <8 x i16>, ptr %A
%e = extractelement <8 x i16> %lv, i32 %idx
@@ -14717,15 +14703,9 @@ define i32 @load_single_extract_variable_index_i32(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_i32:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov w9, w1
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and x9, x9, #0x3
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: ldr w0, [x8, x9, lsl #2]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: mov w8, w1
+; CHECK-GISEL-NEXT: and x8, x8, #0x3
+; CHECK-GISEL-NEXT: ldr w0, [x0, x8, lsl #2]
; CHECK-GISEL-NEXT: ret
%lv = load <4 x i32>, ptr %A
%e = extractelement <4 x i32> %lv, i32 %idx
@@ -14779,14 +14759,8 @@ define i32 @load_single_extract_variable_index_masked_i32(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_masked_i32:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and w9, w1, #0x3
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: ldr w0, [x8, w9, uxtw #2]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: and w8, w1, #0x3
+; CHECK-GISEL-NEXT: ldr w0, [x0, w8, uxtw #2]
; CHECK-GISEL-NEXT: ret
%idx.x = and i32 %idx, 3
%lv = load <4 x i32>, ptr %A
@@ -14803,14 +14777,8 @@ define i32 @load_single_extract_variable_index_masked2_i32(ptr %A, i32 %idx) {
;
; CHECK-GISEL-LABEL: load_single_extract_variable_index_masked2_i32:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: sub sp, sp, #16
-; CHECK-GISEL-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GISEL-NEXT: ldr q0, [x0]
-; CHECK-GISEL-NEXT: mov x8, sp
-; CHECK-GISEL-NEXT: and w9, w1, #0x1
-; CHECK-GISEL-NEXT: str q0, [sp]
-; CHECK-GISEL-NEXT: ldr w0, [x8, w9, uxtw #2]
-; CHECK-GISEL-NEXT: add sp, sp, #16
+; CHECK-GISEL-NEXT: and w8, w1, #0x1
+; CHECK-GISEL-NEXT: ldr w0, [x0, w8, uxtw #2]
; CHECK-GISEL-NEXT: ret
%idx.x = and i32 %idx, 1
%lv = load <4 x i32>, ptr %A
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
index 69346de9bb79805..80dc3dead35ab23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll
@@ -124,71 +124,71 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1)
; GFX9-LABEL: test_add_mul_multiple_defs_z:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_add_mul_multiple_defs_z:
; GFX9-CONTRACT: ; %bb.0: ; %.entry
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v3
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-DENORM-LABEL: test_add_mul_multiple_defs_z:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
-; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
; GFX9-UNSAFE: ; %bb.0: ; %.entry
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v3
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_add_mul_multiple_defs_z:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_add_mul_multiple_defs_z:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v3, v0, v1
-; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2
; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DENORM-LABEL: test_add_mul_multiple_defs_z:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
-; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z:
; GFX10-UNSAFE: ; %bb.0: ; %.entry
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v3, v0, v1
-; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v2
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
%a = fmul float %x, %y
@@ -202,71 +202,71 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace
; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX9-CONTRACT: ; %bb.0: ; %.entry
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v3
+; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
-; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX9-UNSAFE: ; %bb.0: ; %.entry
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-UNSAFE-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0)
-; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v3
+; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v2, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-CONTRACT-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v3, v0, v1
-; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1
+; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2
; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX10-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: v_mac_f32_e32 v3, v0, v1
-; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/72670