[llvm] AMDGPU/GlobalISel: RegBankLegalize rules for G_EXTRACT_VECTOR_ELT (PR #189144)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 27 21:54:38 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-globalisel
Author: vangthao95
<details>
<summary>Changes</summary>
---
Patch is 179.06 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/189144.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp (+91)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h (+2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+15)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h (+3-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll (+481-681)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir (+508-344)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 46ff5342a7dd9..0e0a02a094001 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -890,6 +890,93 @@ bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
return true;
}
+bool RegBankLegalizeHelper::lowerExtrVecEltToSel(MachineInstr &MI) {
+ // Lower an extract vector element (operands: Dst, Src vector, Idx) to a
+ // compare-select chain over the unmerged VGPR elements of Src:
+ //   result = elt[0]
+ //   for i in 1..N-1: result = (idx == i) ? elt[i] : result
+ // An Idx equal to no i in 1..N-1 (0 or out of range) yields elt[0].
+ // When the index is divergent, each lane may want a different element, so
+ // we must check every element per lane. s64 selects as two s32 halves.
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ Register Idx = MI.getOperand(2).getReg();
+
+ LLT VecTy = MRI.getType(Src);
+ LLT ScalarTy = VecTy.getScalarType();
+ unsigned NumElts = VecTy.getNumElements();
+
+ SmallVector<Register, 16> Elts;
+ for (unsigned I = 0; I < NumElts; ++I)
+ Elts.push_back(MRI.createVirtualRegister({VgprRB, ScalarTy}));
+
+ B.buildUnmerge(Elts, Src);
+ Register PrevSelect = Elts[0]; // Running result; element 0 is the default.
+ for (unsigned I = 1; I < NumElts; ++I) {
+ auto IdxConst = B.buildConstant({SgprRB, MRI.getType(Idx)}, I);
+ auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, {VccRB, S1}, Idx, IdxConst);
+
+ MachineInstrBuilder Sel;
+ if (ScalarTy.getSizeInBits() == 32) {
+ Sel = B.buildSelect({VgprRB, ScalarTy}, Cmp, Elts[I], PrevSelect);
+ } else if (ScalarTy.getSizeInBits() == 64) {
+ auto EltsUnmerge = B.buildUnmerge({VgprRB, S32}, Elts[I]);
+ auto PrevSelectUnmerge = B.buildUnmerge({VgprRB, S32}, PrevSelect);
+ auto SelLo = B.buildSelect({VgprRB, S32}, Cmp, EltsUnmerge.getReg(0),
+ PrevSelectUnmerge.getReg(0));
+ auto SelHi = B.buildSelect({VgprRB, S32}, Cmp, EltsUnmerge.getReg(1),
+ PrevSelectUnmerge.getReg(1));
+ Sel = B.buildMergeValues({VgprRB, ScalarTy},
+ {SelLo.getReg(0), SelHi.getReg(0)});
+ } else {
+ llvm_unreachable(
+ "expected s32 or s64 element type for extract vector elt");
+ }
+
+ PrevSelect = Sel.getReg(0); // Feeds the next link of the chain.
+ }
+
+ B.buildCopy(Dst, PrevSelect);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool RegBankLegalizeHelper::lowerExtrVecEltTo32(MachineInstr &MI) {
+ // Reduce a 64-bit element extract to two 32-bit extracts:
+ //   vec32 = bitcast <N x s64> to <2N x s32>
+ //   lo = vec32[idx << 1]
+ //   hi = vec32[(idx << 1) + 1]
+ //   result = merge(lo, hi)
+ // The bitcast and index math keep the register banks of Src and Idx; both
+ // new s32 extracts produce VGPR results.
+ // With a uniform index all lanes read the same element, so the s32 extracts
+ // can lower to MOVREL, avoiding the O(N) select chain in ExtrVecEltToSel.
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ Register Idx = MI.getOperand(2).getReg();
+
+ LLT SrcTy = MRI.getType(Src);
+ LLT Vec32Ty = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
+
+ const RegisterBank *SrcRB = MRI.getRegBank(Src);
+ const RegisterBank *IdxRB = MRI.getRegBank(Idx);
+
+ auto CastSrc = B.buildBitcast({SrcRB, Vec32Ty}, Src);
+
+ // New indices: IdxLo = Idx << 1, IdxHi = IdxLo + 1 (One serves both uses).
+ auto One = B.buildConstant({SgprRB, S32}, 1);
+ auto IdxLo = B.buildShl({IdxRB, S32}, Idx, One);
+ auto IdxHi = B.buildAdd({IdxRB, S32}, IdxLo, One);
+
+ auto ExtLo = B.buildExtractVectorElement({VgprRB, S32}, CastSrc, IdxLo);
+ auto ExtHi = B.buildExtractVectorElement({VgprRB, S32}, CastSrc, IdxHi);
+
+ B.buildMergeLikeInstr(Dst, {ExtLo.getReg(0), ExtHi.getReg(0)});
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool RegBankLegalizeHelper::lower(MachineInstr &MI,
const RegBankLLTMapping &Mapping,
WaterfallInfo &WFI) {
@@ -1161,6 +1248,10 @@ bool RegBankLegalizeHelper::lower(MachineInstr &MI,
return applyRegisterBanksINTRIN_IMAGE(MI);
case SplitBitCount64To32:
return lowerSplitBitCount64To32(MI);
+ case ExtrVecEltToSel:
+ return lowerExtrVecEltToSel(MI);
+ case ExtrVecEltTo32:
+ return lowerExtrVecEltTo32(MI);
}
if (!WFI.SgprWaterfallOperandRegs.empty()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index 577c26e4bf02a..8c39ad3238b18 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -140,6 +140,8 @@ class RegBankLegalizeHelper {
bool lowerSplitBitCount64To32(MachineInstr &MI);
bool lowerUnpackMinMax(MachineInstr &MI);
bool lowerUnpackAExt(MachineInstr &MI);
+ bool lowerExtrVecEltToSel(MachineInstr &MI);
+ bool lowerExtrVecEltTo32(MachineInstr &MI);
bool applyRegisterBanksINTRIN_IMAGE(MachineInstr &MI);
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 198e52a6f9ae2..a39f9f152e823 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -670,6 +670,21 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniBRC}, {{}, {}, VerifyAllSgprGPHI}})
.Any({{DivBRC}, {{}, {}, VerifyAllSgprOrVgprGPHI}});
+ addRulesForGOpcs({G_EXTRACT_VECTOR_ELT})
+ .Any({{UniB32, UniBRC, UniS32}, {{SgprB32}, {SgprBRC, Sgpr32}}})
+ .Any({{DivB32, DivBRC, UniS32}, {{VgprB32}, {VgprBRC, Sgpr32}}})
+ .Any({{DivB32, UniBRC, DivS32},
+ {{VgprB32}, {VgprBRC, Vgpr32}, ExtrVecEltToSel}})
+ .Any({{DivB32, DivBRC, DivS32},
+ {{VgprB32}, {VgprBRC, Vgpr32}, ExtrVecEltToSel}})
+ .Any({{UniB64, UniBRC, UniS32}, {{SgprB64}, {SgprBRC, Sgpr32}}})
+ .Any({{DivB64, DivBRC, UniS32},
+ {{VgprB64}, {VgprBRC, Sgpr32}, ExtrVecEltTo32}})
+ .Any({{DivB64, UniBRC, DivS32},
+ {{VgprB64}, {VgprBRC, Vgpr32}, ExtrVecEltToSel}})
+ .Any({{DivB64, DivBRC, DivS32},
+ {{VgprB64}, {VgprBRC, Vgpr32}, ExtrVecEltToSel}});
+
// LOAD {Div}, {{VgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
// LOAD {Uni}, {{UniInVgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
// LOAD_NORET {}, {{}, {Imm, VgprSrc, ..., Sgpr_WF_RsrcIdx}}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index d8a88b0ee558b..d27bed8333e26 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -290,7 +290,9 @@ enum LoweringMethodID {
VerifyAllSgprGPHI,
VerifyAllSgprOrVgprGPHI,
ApplyINTRIN_IMAGE,
- SplitBitCount64To32
+ SplitBitCount64To32,
+ ExtrVecEltToSel,
+ ExtrVecEltTo32
};
enum FastRulesTypes {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 206011adf0213..cdbac5f38bd68 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,MOVREL %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GPRIDX %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,MOVREL %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define float @dyn_extract_v8f32_const_s_v(i32 %sel) {
; GCN-LABEL: dyn_extract_v8f32_const_s_v:
@@ -53,39 +53,20 @@ entry:
}
define amdgpu_ps float @dyn_extract_v8f32_const_s_s(i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_const_s_s:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_cmp_eq_u32 s2, 1
-; GPRIDX-NEXT: s_cselect_b32 s0, 2.0, 1.0
-; GPRIDX-NEXT: s_cmp_eq_u32 s2, 2
-; GPRIDX-NEXT: s_cselect_b32 s0, 0x40400000, s0
-; GPRIDX-NEXT: s_cmp_eq_u32 s2, 3
-; GPRIDX-NEXT: s_cselect_b32 s0, 4.0, s0
-; GPRIDX-NEXT: s_cmp_eq_u32 s2, 4
-; GPRIDX-NEXT: s_cselect_b32 s0, 0x40a00000, s0
-; GPRIDX-NEXT: s_cmp_eq_u32 s2, 5
-; GPRIDX-NEXT: s_cselect_b32 s0, 0x40c00000, s0
-; GPRIDX-NEXT: s_cmp_eq_u32 s2, 6
-; GPRIDX-NEXT: s_cselect_b32 s0, 0x40e00000, s0
-; GPRIDX-NEXT: s_cmp_eq_u32 s2, 7
-; GPRIDX-NEXT: s_cselect_b32 s0, 0x41000000, s0
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f32_const_s_s:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s4, 1.0
-; MOVREL-NEXT: s_mov_b32 m0, s2
-; MOVREL-NEXT: s_mov_b32 s11, 0x41000000
-; MOVREL-NEXT: s_mov_b32 s10, 0x40e00000
-; MOVREL-NEXT: s_mov_b32 s9, 0x40c00000
-; MOVREL-NEXT: s_mov_b32 s8, 0x40a00000
-; MOVREL-NEXT: s_mov_b32 s7, 4.0
-; MOVREL-NEXT: s_mov_b32 s6, 0x40400000
-; MOVREL-NEXT: s_mov_b32 s5, 2.0
-; MOVREL-NEXT: s_movrels_b32 s0, s4
-; MOVREL-NEXT: v_mov_b32_e32 v0, s0
-; MOVREL-NEXT: ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f32_const_s_s:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s4, 1.0
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_mov_b32 s11, 0x41000000
+; GCN-NEXT: s_mov_b32 s10, 0x40e00000
+; GCN-NEXT: s_mov_b32 s9, 0x40c00000
+; GCN-NEXT: s_mov_b32 s8, 0x40a00000
+; GCN-NEXT: s_mov_b32 s7, 4.0
+; GCN-NEXT: s_mov_b32 s6, 0x40400000
+; GCN-NEXT: s_mov_b32 s5, 2.0
+; GCN-NEXT: s_movrels_b32 s0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: dyn_extract_v8f32_const_s_s:
; GFX10PLUS: ; %bb.0: ; %entry
@@ -202,20 +183,9 @@ entry:
define amdgpu_ps float @dyn_extract_v8f32_v_s(<8 x float> %vec, i32 inreg %sel) {
; GPRIDX-LABEL: dyn_extract_v8f32_v_s:
; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
-; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2
-; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3
-; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 4
-; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5
-; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6
-; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7
-; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v0
+; GPRIDX-NEXT: s_set_gpr_idx_off
; GPRIDX-NEXT: ; return to shader part epilog
;
; MOVREL-LABEL: dyn_extract_v8f32_v_s:
@@ -235,39 +205,20 @@ entry:
}
define amdgpu_ps float @dyn_extract_v8f32_s_s(<8 x float> inreg %vec, i32 inreg %sel) {
-; GPRIDX-LABEL: dyn_extract_v8f32_s_s:
-; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_cmp_eq_u32 s10, 1
-; GPRIDX-NEXT: s_cselect_b32 s0, s3, s2
-; GPRIDX-NEXT: s_cmp_eq_u32 s10, 2
-; GPRIDX-NEXT: s_cselect_b32 s0, s4, s0
-; GPRIDX-NEXT: s_cmp_eq_u32 s10, 3
-; GPRIDX-NEXT: s_cselect_b32 s0, s5, s0
-; GPRIDX-NEXT: s_cmp_eq_u32 s10, 4
-; GPRIDX-NEXT: s_cselect_b32 s0, s6, s0
-; GPRIDX-NEXT: s_cmp_eq_u32 s10, 5
-; GPRIDX-NEXT: s_cselect_b32 s0, s7, s0
-; GPRIDX-NEXT: s_cmp_eq_u32 s10, 6
-; GPRIDX-NEXT: s_cselect_b32 s0, s8, s0
-; GPRIDX-NEXT: s_cmp_eq_u32 s10, 7
-; GPRIDX-NEXT: s_cselect_b32 s0, s9, s0
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT: ; return to shader part epilog
-;
-; MOVREL-LABEL: dyn_extract_v8f32_s_s:
-; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 m0, s10
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_movrels_b32 s0, s0
-; MOVREL-NEXT: v_mov_b32_e32 v0, s0
-; MOVREL-NEXT: ; return to shader part epilog
+; GCN-LABEL: dyn_extract_v8f32_s_s:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_mov_b32 s0, s2
+; GCN-NEXT: s_mov_b32 m0, s10
+; GCN-NEXT: s_mov_b32 s1, s3
+; GCN-NEXT: s_mov_b32 s2, s4
+; GCN-NEXT: s_mov_b32 s3, s5
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s7
+; GCN-NEXT: s_mov_b32 s6, s8
+; GCN-NEXT: s_mov_b32 s7, s9
+; GCN-NEXT: s_movrels_b32 s0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: dyn_extract_v8f32_s_s:
; GFX10PLUS: ; %bb.0: ; %entry
@@ -292,123 +243,42 @@ define i64 @dyn_extract_v8i64_const_s_v(i32 %sel) {
; GCN-LABEL: dyn_extract_v8i64_const_s_v:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b64 s[16:17], 2
-; GCN-NEXT: s_mov_b64 s[18:19], 1
-; GCN-NEXT: s_mov_b64 s[14:15], 3
-; GCN-NEXT: v_mov_b32_e32 v1, s18
-; GCN-NEXT: v_mov_b32_e32 v2, s19
-; GCN-NEXT: v_mov_b32_e32 v3, s16
-; GCN-NEXT: v_mov_b32_e32 v4, s17
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: s_mov_b64 s[12:13], 4
-; GCN-NEXT: v_mov_b32_e32 v5, s14
-; GCN-NEXT: v_mov_b32_e32 v6, s15
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, 1, 2, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
-; GCN-NEXT: s_mov_b64 s[10:11], 5
-; GCN-NEXT: v_mov_b32_e32 v7, s12
-; GCN-NEXT: v_mov_b32_e32 v8, s13
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 3, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
-; GCN-NEXT: s_mov_b64 s[8:9], 6
-; GCN-NEXT: v_mov_b32_e32 v9, s10
-; GCN-NEXT: v_mov_b32_e32 v10, s11
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 4, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
-; GCN-NEXT: s_mov_b64 s[6:7], 7
-; GCN-NEXT: v_mov_b32_e32 v11, s8
-; GCN-NEXT: v_mov_b32_e32 v12, s9
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 5, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
-; GCN-NEXT: s_mov_b64 s[4:5], 8
-; GCN-NEXT: v_mov_b32_e32 v13, s6
-; GCN-NEXT: v_mov_b32_e32 v14, s7
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 6, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
-; GCN-NEXT: v_mov_b32_e32 v15, s4
-; GCN-NEXT: v_mov_b32_e32 v16, s5
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 7, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
+; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 8, vcc
+; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: dyn_extract_v8i64_const_s_v:
-; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_mov_b64 s[4:5], 2
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: v_mov_b32_e32 v2, s5
-; GFX10-NEXT: s_mov_b64 s[6:7], 1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v0
-; GFX10-NEXT: s_mov_b64 s[8:9], 3
-; GFX10-NEXT: v_cndmask_b32_e32 v1, s6, v1, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v2, s7, v2, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT: s_mov_b64 s[6:7], 4
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v0
-; GFX10-NEXT: s_mov_b64 s[8:9], 5
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10-NEXT: s_mov_b64 s[6:7], 6
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, s4
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0
-; GFX10-NEXT: s_mov_b64 s[8:9], 7
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX10-NEXT: s_mov_b64 s[6:7], 8
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s6, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s7, vcc_lo
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: dyn_extract_v8i64_const_s_v:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b64 s[0:1], 2
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
-; GFX11-NEXT: s_mov_b64 s[2:3], 1
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v0
-; GFX11-NEXT: s_mov_b64 s[4:5], 3
-; GFX11-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v2, s3, v2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT: s_mov_b64 s[2:3], 4
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s5, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v0
-; GFX11-NEXT: s_mov_b64 s[4:5], 5
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX11-NEXT: s_mov_b64 s[2:3], 6
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s5, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v0
-; GFX11-NEXT: s_mov_b64 s[4:5], 7
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s3, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT: s_mov_b64 s[2:3], 8
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s5, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX10PLUS-LABEL: dyn_extract_v8i64_const_s_v:
+; GFX10PLUS: ; %bb.0: ; %entry
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 1, 2, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 3, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 4, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 5, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 6, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 7, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, 8, vcc_lo
+; GFX10PLUS-NEXT: v_mov_b32_e32 v1, 0
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
entry...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/189144
More information about the llvm-commits
mailing list