[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Improve readanylane combines in regbanklegalize (PR #142789)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jun 4 08:24:09 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: Petar Avramovic (petar-avramovic)
<details>
<summary>Changes</summary>
---
Patch is 22.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142789.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp (+118-18)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll (+2-23)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir (+19-59)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index ba661348ca5b5..b5fe0ed499255 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -23,6 +23,7 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -137,7 +138,123 @@ class AMDGPURegBankLegalizeCombiner {
return {MatchMI, MatchMI->getOperand(1).getReg()};
}
+  /// Match "Src = G_AMDGPU_READANYLANE X" where X is one of the results of a
+  /// G_UNMERGE_VALUES.
+  ///
+  /// \returns the unmerge together with the def-operand index of X within it,
+  /// or {nullptr, -1} when \p Src is not produced by such a sequence.
+  std::tuple<GUnmerge *, int> tryMatchRALFromUnmerge(Register Src) {
+    MachineInstr *SrcDef = MRI.getVRegDef(Src);
+    if (SrcDef->getOpcode() != AMDGPU::G_AMDGPU_READANYLANE)
+      return {nullptr, -1};
+
+    // Operand 1 of the read-any-lane is the value being read.
+    Register LaneInput = SrcDef->getOperand(1).getReg();
+    GUnmerge *Producer = getOpcodeDef<GUnmerge>(LaneInput, MRI);
+    if (!Producer)
+      return {nullptr, -1};
+
+    return {Producer, Producer->findRegisterDefOperandIdx(LaneInput, nullptr)};
+  }
+
+  /// Find the register that the G_AMDGPU_READANYLANE(s) producing \p Src read
+  /// from, looking through the merge/unmerge patterns listed below.
+  ///
+  /// \returns the read-any-lane source with the same type as \p Src, or an
+  /// invalid Register when none of the patterns match.
+  Register getReadAnyLaneSrc(Register Src) {
+    // Src = G_AMDGPU_READANYLANE RALSrc
+    auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
+    if (RAL)
+      return RALSrc;
+
+    // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc
+    // LoSgpr = G_AMDGPU_READANYLANE LoVgpr
+    // HiSgpr = G_AMDGPU_READANYLANE HiVgpr
+    // Src = G_MERGE_VALUES LoSgpr, HiSgpr
+    auto *Merge = getOpcodeDef<GMergeLikeInstr>(Src, MRI);
+    if (Merge) {
+      unsigned NumElts = Merge->getNumSources();
+      auto [Unmerge, Idx] = tryMatchRALFromUnmerge(Merge->getSourceReg(0));
+      if (!Unmerge || Unmerge->getNumDefs() != NumElts || Idx != 0)
+        return {};
+
+      // Check that all elements come from the same unmerge and that there is
+      // no shuffling.
+      for (unsigned i = 1; i < NumElts; ++i) {
+        auto [UnmergeI, IdxI] = tryMatchRALFromUnmerge(Merge->getSourceReg(i));
+        if (UnmergeI != Unmerge || (unsigned)IdxI != i)
+          return {};
+      }
+
+      // The pieces may have been re-merged into a different type than the one
+      // that was unmerged (e.g. <2 x s32> pieces merged into s64). Callers
+      // substitute the result for Src, so only accept an exact type match.
+      Register UnmergeSrc = Unmerge->getSourceReg();
+      if (MRI.getType(Src) != MRI.getType(UnmergeSrc))
+        return {};
+      return UnmergeSrc;
+    }
+
+    // ..., VgprI, ... = G_UNMERGE_VALUES VgprLarge
+    // SgprI = G_AMDGPU_READANYLANE VgprI
+    // SgprLarge = G_MERGE_VALUES ..., SgprI, ...
+    // ..., Src, ... = G_UNMERGE_VALUES SgprLarge
+    auto *UnMerge = getOpcodeDef<GUnmerge>(Src, MRI);
+    if (!UnMerge)
+      return {};
+
+    // Src must itself be one of the unmerge results; a negative index means
+    // the unmerge was found some other way and there is no element to select.
+    int EltIdx = UnMerge->findRegisterDefOperandIdx(Src, nullptr);
+    if (EltIdx < 0)
+      return {};
+
+    // Element EltIdx of the unmerge only corresponds to source EltIdx of the
+    // merge when both split the large value into the same number of pieces.
+    auto *LargeMerge =
+        getOpcodeDef<GMergeLikeInstr>(UnMerge->getSourceReg(), MRI);
+    if (!LargeMerge || LargeMerge->getNumSources() != UnMerge->getNumDefs())
+      return {};
+
+    auto [EltRAL, EltRALSrc] = tryMatch(LargeMerge->getSourceReg(EltIdx),
+                                        AMDGPU::G_AMDGPU_READANYLANE);
+    if (EltRAL && MRI.getType(Src) == MRI.getType(EltRALSrc))
+      return EltRALSrc;
+
+    return {};
+  }
+
+  /// Fold away a G_AMDGPU_READANYLANE (optionally followed by a G_BITCAST)
+  /// whose result is copied into a VGPR by \p Copy, making the destination use
+  /// the read-any-lane source directly.
+  ///
+  /// \param Copy the COPY instruction to combine; erased on success.
+  /// \returns true if \p Copy was rewritten and erased, false if nothing was
+  /// changed.
+  bool tryEliminateReadAnyLane(MachineInstr &Copy) {
+    Register Dst = Copy.getOperand(0).getReg();
+    Register Src = Copy.getOperand(1).getReg();
+    if (!Src.isVirtual())
+      return false;
+
+    // The rewrite feeds Dst from a VGPR-bank value (RALSrc or a VgprRB
+    // bitcast of it), so it is only valid when Dst is a VGPR. Without this
+    // check an SGPR destination would end up defined from a VGPR.
+    if (Dst.isVirtual()) {
+      if (MRI.getRegBankOrNull(Dst) != VgprRB)
+        return false;
+    } else {
+      const auto *SIRI =
+          static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+      if (!SIRI->isVGPR(MRI, Dst))
+        return false;
+    }
+
+    // Look through a single G_BITCAST between the read-any-lane and the copy.
+    Register RALDst = Src;
+    MachineInstr &SrcMI = *MRI.getVRegDef(Src);
+    const bool ThroughBitcast = SrcMI.getOpcode() == AMDGPU::G_BITCAST;
+    if (ThroughBitcast)
+      RALDst = SrcMI.getOperand(1).getReg();
+
+    Register RALSrc = getReadAnyLaneSrc(RALDst);
+    if (!RALSrc)
+      return false;
+
+    if (Dst.isVirtual()) {
+      if (!ThroughBitcast) {
+        // Src = READANYLANE RALSrc
+        // Dst = Copy Src
+        // ->
+        // Dst = RALSrc
+        MRI.replaceRegWith(Dst, RALSrc);
+      } else {
+        // RALDst = READANYLANE RALSrc
+        // Src = G_BITCAST RALDst
+        // Dst = Copy Src
+        // ->
+        // NewVgpr = G_BITCAST RALSrc
+        // Dst = NewVgpr
+        //
+        // NOTE(review): the builder is not repositioned on this path
+        // (B.setInstr(Copy) is only called for a physical Dst), so the
+        // G_BITCAST is emitted at the builder's current insertion point.
+        // Confirm that point is always after RALSrc's def and before Dst's
+        // uses, or hoist B.setInstr(Copy) and regenerate the test checks.
+        auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
+        MRI.replaceRegWith(Dst, Bitcast.getReg(0));
+      }
+    } else {
+      // A physical register cannot be replaced, so a COPY into it is kept.
+      B.setInstr(Copy);
+      if (!ThroughBitcast) {
+        // Src = READANYLANE RALSrc
+        // $Dst = Copy Src
+        // ->
+        // $Dst = Copy RALSrc
+        B.buildCopy(Dst, RALSrc);
+      } else {
+        // RALDst = READANYLANE RALSrc
+        // Src = G_BITCAST RALDst
+        // $Dst = Copy Src
+        // ->
+        // NewVgpr = G_BITCAST RALSrc
+        // $Dst = Copy NewVgpr
+        auto Bitcast = B.buildBitcast({VgprRB, MRI.getType(Src)}, RALSrc);
+        B.buildCopy(Dst, Bitcast.getReg(0));
+      }
+    }
+
+    eraseInstr(Copy, MRI, nullptr);
+    return true;
+  }
+
void tryCombineCopy(MachineInstr &MI) {
+ if (tryEliminateReadAnyLane(MI))
+ return;
+
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
// Skip copies of physical registers.
@@ -160,24 +277,7 @@ class AMDGPURegBankLegalizeCombiner {
auto One = B.buildConstant({SgprRB, S32}, 1);
auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One);
B.buildInstr(AMDGPU::G_AMDGPU_COPY_VCC_SCC, {Dst}, {BoolSrc});
- cleanUpAfterCombine(MI, Trunc);
- return;
- }
-
- // Src = G_AMDGPU_READANYLANE RALSrc
- // Dst = COPY Src
- // ->
- // Dst = RALSrc
- if (MRI.getRegBankOrNull(Dst) == VgprRB &&
- MRI.getRegBankOrNull(Src) == SgprRB) {
- auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE);
- if (!RAL)
- return;
-
- assert(MRI.getRegBank(RALSrc) == VgprRB);
- MRI.replaceRegWith(Dst, RALSrc);
- cleanUpAfterCombine(MI, RAL);
- return;
+ eraseInstr(MI, MRI, nullptr);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll
index 51b473f2d8994..5f72d3e2ab161 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.ll
@@ -20,8 +20,6 @@ define amdgpu_ps float @readanylane_to_physical_vgpr(ptr addrspace(1) inreg %ptr
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: ; return to shader part epilog
%load = load volatile float, ptr addrspace(1) %ptr
ret float %load
@@ -33,8 +31,6 @@ define amdgpu_ps void @readanylane_to_bitcast_to_virtual_vgpr(ptr addrspace(1) i
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v1
-; CHECK-NEXT: v_mov_b32_e32 v1, s0
; CHECK-NEXT: global_store_dword v0, v1, s[2:3]
; CHECK-NEXT: s_endpgm
%load = load volatile <2 x i16>, ptr addrspace(1) %ptr0
@@ -49,8 +45,6 @@ define amdgpu_ps float @readanylane_to_bitcast_to_physical_vgpr(ptr addrspace(1)
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: ; return to shader part epilog
%load = load volatile <2 x i16>, ptr addrspace(1) %ptr0
%bitcast = bitcast <2 x i16> %load to float
@@ -63,10 +57,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_to_virtual_vgpr(ptr addrspace(1
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; CHECK-NEXT: s_endpgm
%load = load volatile i64, ptr addrspace(1) %ptr0
@@ -85,10 +75,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_bitcast_to_virtual_vgpr(ptr add
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; CHECK-NEXT: s_endpgm
%load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
@@ -109,9 +95,7 @@ define amdgpu_ps void @unmerge_readanylane_merge_extract_to_virtual_vgpr(ptr add
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
+; CHECK-NEXT: global_store_dword v2, v1, s[2:3]
; CHECK-NEXT: s_endpgm
%load = load volatile <2 x i32>, ptr addrspace(1) %ptr0
%extracted = extractelement <2 x i32> %load, i32 1
@@ -125,8 +109,7 @@ define amdgpu_ps float @unmerge_readanylane_merge_extract_to_physical_vgpr(ptr a
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v1
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v0, v1
; CHECK-NEXT: ; return to shader part epilog
%load = load volatile <2 x float>, ptr addrspace(1) %ptr0
%extracted = extractelement <2 x float> %load, i32 1
@@ -139,8 +122,6 @@ define amdgpu_ps void @unmerge_readanylane_merge_extract_bitcast_to_virtual_vgpr
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: global_store_dword v2, v0, s[2:3]
; CHECK-NEXT: s_endpgm
%load = load volatile <4 x i16>, ptr addrspace(1) %ptr0
@@ -156,8 +137,6 @@ define amdgpu_ps float @unmerge_readanylane_merge_extract_bitcast_to_physical_vg
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: ; return to shader part epilog
%load = load volatile <4 x i16>, ptr addrspace(1) %ptr0
%extracted = shufflevector <4 x i16> %load, <4 x i16> %load, <2 x i32> <i32 0, i32 1>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir
index 673cf1696e5e0..6490b7ee4ed23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/readanylane-combines.mir
@@ -46,8 +46,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (s32), addrspace 1)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[LOAD]]
- ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_READANYLANE]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[LOAD]](s32)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -74,11 +73,9 @@ body: |
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s16>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s16>), addrspace 1)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(<2 x s16>) = G_AMDGPU_READANYLANE [[LOAD]]
- ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[AMDGPU_READANYLANE]](<2 x s16>)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[BITCAST]](s32)
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
- ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[COPY5]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[LOAD]](<2 x s16>)
+ ; CHECK-NEXT: G_STORE [[BITCAST]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -106,8 +103,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s16>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s16>), addrspace 1)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(<2 x s16>) = G_AMDGPU_READANYLANE [[LOAD]]
- ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[AMDGPU_READANYLANE]](<2 x s16>)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[LOAD]](<2 x s16>)
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST]](s32)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%0:sgpr(s32) = COPY $sgpr0
@@ -136,13 +132,8 @@ body: |
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s64) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (s64), addrspace 1)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LOAD]](s64)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
- ; CHECK-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
- ; CHECK-NEXT: [[MV2:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s64) = COPY [[MV2]](s64)
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
- ; CHECK-NEXT: G_STORE [[COPY4]](s64), [[COPY5]](p1) :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
+ ; CHECK-NEXT: G_STORE [[LOAD]](s64), [[COPY4]](p1) :: (store (s64), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -169,11 +160,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(s64) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (s64), addrspace 1)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LOAD]](s64)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
- ; CHECK-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
- ; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(s64) = G_MERGE_VALUES [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32)
- ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV1]](s64)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -200,14 +187,9 @@ body: |
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr(s32) = COPY $sgpr3
; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
- ; CHECK-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<2 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32)
- ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s64) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s64) = COPY [[BITCAST]](s64)
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
- ; CHECK-NEXT: G_STORE [[COPY4]](s64), [[COPY5]](p1) :: (store (s64), addrspace 1)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s64) = G_BITCAST [[LOAD]](<2 x s32>)
+ ; CHECK-NEXT: G_STORE [[BITCAST]](s64), [[COPY4]](p1) :: (store (s64), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -235,11 +217,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
- ; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
- ; CHECK-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<2 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32)
- ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s64) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>)
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s64) = G_BITCAST [[LOAD]](<2 x s32>)
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](s64)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
%0:sgpr(s32) = COPY $sgpr0
@@ -269,13 +247,8 @@ body: |
; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
- ; CHECK-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<2 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>)
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
- ; CHECK-NEXT: G_STORE [[COPY4]](s32), [[COPY5]](p1) :: (store (s32), addrspace 1)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(p1) = COPY [[MV1]](p1)
+ ; CHECK-NEXT: G_STORE [[UV1]](s32), [[COPY4]](p1) :: (store (s32), addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -304,11 +277,7 @@ body: |
; CHECK-NEXT: [[MV:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<2 x s32>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<2 x s32>), addrspace 1)
; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV]]
- ; CHECK-NEXT: [[AMDGPU_READANYLANE1:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[UV1]]
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<2 x s32>) = G_BUILD_VECTOR [[AMDGPU_READANYLANE]](s32), [[AMDGPU_READANYLANE1]](s32)
- ; CHECK-NEXT: [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>)
- ; CHECK-NEXT: $vgpr0 = COPY [[UV3]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[UV1]](s32)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = COPY $sgpr1
@@ -337,14 +306,9 @@ body: |
; CHECK-NEXT: [[MV1:%[0-9]+]]:sgpr(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
; CHECK-NEXT: [[LOAD:%[0-9]+]]:vgpr(<4 x s16>) = G_LOAD [[MV]](p1) :: (volatile "amdgpu-noclobber" load (<4 x s16>), addrspace 1)
; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(<2 x s16>), [[UV1:%[0-9]+]]:vgpr(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
- ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(<2 x s16>) = G_AMDGPU_READANYLANE [[UV]]
- ; CHECK-NEXT: [[AMDGPU_READAN...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/142789
More information about the llvm-branch-commits
mailing list