[llvm] b25c7ca - [AMDGPU][GlobalISel] Widen the vector operand in G_BUILD/INSERT/EXTRACT_VECTOR
Mateja Marjanovic via llvm-commits
llvm-commits at lists.llvm.org
Wed May 3 08:15:06 PDT 2023
Author: Mateja Marjanovic
Date: 2023-05-03T17:14:38+02:00
New Revision: b25c7cafcbe1b52ea2d1ff5e5c2f13674b5f297d
URL: https://github.com/llvm/llvm-project/commit/b25c7cafcbe1b52ea2d1ff5e5c2f13674b5f297d
DIFF: https://github.com/llvm/llvm-project/commit/b25c7cafcbe1b52ea2d1ff5e5c2f13674b5f297d.diff
LOG: [AMDGPU][GlobalISel] Widen the vector operand in G_BUILD/INSERT/EXTRACT_VECTOR
Widen the vector operand type of G_BUILD_VECTOR, G_INSERT_VECTOR_ELT and
G_EXTRACT_VECTOR_ELT to the size of the nearest larger register class.
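In other words: when a vector type is register-sized but its bit width has
no exactly matching register class (e.g. <7 x s64> = 448 bits), the
legalizer now pads the vector with undef elements until the width does
match one (<8 x s64> = 512 bits). A minimal standalone C++ sketch of the
rounding step, with the SGPR class widths hard-coded for illustration (the
real code queries SIRegisterInfo::getSGPRClassForBitWidth):

#include <cstdio>

// Bit widths that have an exact SGPR register class after this patch.
static bool hasSGPRClassForBitWidth(unsigned Bits) {
  switch (Bits) {
  case 16: case 32: case 64: case 96: case 128: case 160: case 192:
  case 224: case 256: case 288: case 320: case 352: case 384:
  case 512: case 1024:
    return true;
  default:
    return false;
  }
}

// Mirrors moreElementsToNextExistingRegClass: bump the element count
// until NumElts * EltSize hits an existing class (capped at 1024 bits).
static unsigned nextLegalNumElts(unsigned NumElts, unsigned EltSize) {
  const unsigned MaxRegisterSize = 1024;
  unsigned N;
  for (N = NumElts; N < MaxRegisterSize / EltSize; ++N)
    if (hasSGPRClassForBitWidth(N * EltSize))
      break;
  return N;
}

int main() {
  printf("<7 x s64>  -> <%u x s64>\n", nextLegalNumElts(7, 64));  // 8
  printf("<15 x s32> -> <%u x s32>\n", nextLegalNumElts(15, 32)); // 16
}

Note that 448 is skipped even though it is a multiple of 32: being 32-bit
aligned is not enough, the width has to name an actual register class.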
Added:
Modified:
llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index c61ed539a543a..5c5f843764a2d 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -4835,6 +4835,7 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
Observer.changedInstr(MI);
return Legalized;
}
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT:
case TargetOpcode::G_EXTRACT:
if (TypeIdx != 1)
return UnableToLegalize;
@@ -4843,6 +4844,7 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
Observer.changedInstr(MI);
return Legalized;
case TargetOpcode::G_INSERT:
+ case TargetOpcode::G_INSERT_VECTOR_ELT:
case TargetOpcode::G_FREEZE:
case TargetOpcode::G_FNEG:
case TargetOpcode::G_FABS:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 7cb8a95c983ab..296cc34ef1fc4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -131,6 +131,28 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
};
}
+// Increase the number of vector elements to reach the next legal RegClass.
+static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ const unsigned NumElts = Ty.getNumElements();
+ const unsigned EltSize = Ty.getElementType().getSizeInBits();
+ const unsigned MaxNumElts = MaxRegisterSize / EltSize;
+
+ assert(EltSize == 32 || EltSize == 64);
+ assert(Ty.getSizeInBits() < MaxRegisterSize);
+
+ unsigned NewNumElts;
+ // Find the nearest legal RegClass that is larger than the current type.
+ for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
+ if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
+ break;
+ }
+
+ return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
+ };
+}
+
static LLT getBitcastRegisterType(const LLT Ty) {
const unsigned Size = Ty.getSizeInBits();
@@ -215,6 +237,15 @@ static LegalityPredicate isRegisterType(unsigned TypeIdx) {
};
}
+// RegisterType that doesn't have a corresponding RegClass.
+static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ LLT Ty = Query.Types[TypeIdx];
+ return isRegisterType(Ty) &&
+ !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
+ };
+}
+
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];
@@ -1455,10 +1486,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT VecTy = Query.Types[VecTypeIdx];
const LLT IdxTy = Query.Types[IdxTypeIdx];
const unsigned EltSize = EltTy.getSizeInBits();
+ const bool isLegalVecType =
+ !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
return (EltSize == 32 || EltSize == 64) &&
VecTy.getSizeInBits() % 32 == 0 &&
VecTy.getSizeInBits() <= MaxRegisterSize &&
- IdxTy.getSizeInBits() == 32;
+ IdxTy.getSizeInBits() == 32 &&
+ isLegalVecType;
})
.bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
bitcastToVectorElement32(VecTypeIdx))
@@ -1484,6 +1518,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(IdxTypeIdx, S32, S32)
.clampMaxNumElements(VecTypeIdx, S32, 32)
// TODO: Clamp elements for 64-bit vectors?
+ .moreElementsIf(
+ isIllegalRegisterType(VecTypeIdx),
+ moreElementsToNextExistingRegClass(VecTypeIdx))
// It should only be necessary with variable indexes.
// As a last resort, lower to the stack
.lower();
@@ -1538,7 +1575,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalForCartesianProduct(AllS64Vectors, {S64})
.clampNumElements(0, V16S32, V32S32)
.clampNumElements(0, V2S64, V16S64)
- .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
+ .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
+ .moreElementsIf(
+ isIllegalRegisterType(0),
+ moreElementsToNextExistingRegClass(0));
if (ST.hasScalarPackInsts()) {
BuildVector
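The two new hooks split the work: isIllegalRegisterType fires exactly for
types that are register-sized yet have no exact class, and the mutation
above rounds them up to one that exists. A toy, LLVM-free sketch of the
predicate, with LLT and SIRegisterInfo replaced by plain integers (the
isRegisterSized stand-in below is only a rough approximation of the real
isRegisterType):

#include <cstdio>

// Same exact-width table as in the sketch near the top of this mail.
static bool hasSGPRClassForBitWidth(unsigned Bits) {
  switch (Bits) {
  case 16: case 32: case 64: case 96: case 128: case 160: case 192:
  case 224: case 256: case 288: case 320: case 352: case 384:
  case 512: case 1024:
    return true;
  default:
    return false;
  }
}

// Rough stand-in for isRegisterType: nonzero, 32-bit aligned, <= 1024.
static bool isRegisterSized(unsigned Bits) {
  return Bits && Bits % 32 == 0 && Bits <= 1024;
}

static bool isIllegalRegisterTypeBits(unsigned Bits) {
  return isRegisterSized(Bits) && !hasSGPRClassForBitWidth(Bits);
}

int main() {
  printf("448: %d\n", isIllegalRegisterTypeBits(448)); // 1 -> widen
  printf("512: %d\n", isIllegalRegisterTypeBits(512)); // 0 -> already fine
}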
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 9d94344b752ac..5999fc94e934c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2501,31 +2501,31 @@ StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
static const TargetRegisterClass *
getAnyVGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::VReg_64RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::VReg_96RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::VReg_128RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::VReg_160RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::VReg_192RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::VReg_224RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::VReg_256RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::VReg_288RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::VReg_320RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::VReg_352RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::VReg_384RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::VReg_512RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::VReg_1024RegClass;
return nullptr;
@@ -2533,31 +2533,31 @@ getAnyVGPRClassForBitWidth(unsigned BitWidth) {
static const TargetRegisterClass *
getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::VReg_64_Align2RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::VReg_96_Align2RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::VReg_128_Align2RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::VReg_160_Align2RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::VReg_192_Align2RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::VReg_224_Align2RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::VReg_256_Align2RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::VReg_288_Align2RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::VReg_320_Align2RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::VReg_352_Align2RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::VReg_384_Align2RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::VReg_512_Align2RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::VReg_1024_Align2RegClass;
return nullptr;
@@ -2567,9 +2567,9 @@ const TargetRegisterClass *
SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
if (BitWidth == 1)
return &AMDGPU::VReg_1RegClass;
- if (BitWidth <= 16)
+ if (BitWidth == 16)
return &AMDGPU::VGPR_LO16RegClass;
- if (BitWidth <= 32)
+ if (BitWidth == 32)
return &AMDGPU::VGPR_32RegClass;
return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
: getAnyVGPRClassForBitWidth(BitWidth);
@@ -2577,31 +2577,31 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
static const TargetRegisterClass *
getAnyAGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::AReg_64RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::AReg_96RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::AReg_128RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::AReg_160RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::AReg_192RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::AReg_224RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::AReg_256RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::AReg_288RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::AReg_320RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::AReg_352RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::AReg_384RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::AReg_512RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::AReg_1024RegClass;
return nullptr;
@@ -2609,31 +2609,31 @@ getAnyAGPRClassForBitWidth(unsigned BitWidth) {
static const TargetRegisterClass *
getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::AReg_64_Align2RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::AReg_96_Align2RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::AReg_128_Align2RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::AReg_160_Align2RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::AReg_192_Align2RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::AReg_224_Align2RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::AReg_256_Align2RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::AReg_288_Align2RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::AReg_320_Align2RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::AReg_352_Align2RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::AReg_384_Align2RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::AReg_512_Align2RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::AReg_1024_Align2RegClass;
return nullptr;
@@ -2641,9 +2641,9 @@ getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
const TargetRegisterClass *
SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
- if (BitWidth <= 16)
+ if (BitWidth == 16)
return &AMDGPU::AGPR_LO16RegClass;
- if (BitWidth <= 32)
+ if (BitWidth == 32)
return &AMDGPU::AGPR_32RegClass;
return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
: getAnyAGPRClassForBitWidth(BitWidth);
@@ -2651,31 +2651,31 @@ SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
static const TargetRegisterClass *
getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::AV_64RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::AV_96RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::AV_128RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::AV_160RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::AV_192RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::AV_224RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::AV_256RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::AV_288RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::AV_320RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::AV_352RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::AV_384RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::AV_512RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::AV_1024RegClass;
return nullptr;
@@ -2683,31 +2683,31 @@ getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
static const TargetRegisterClass *
getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::AV_64_Align2RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::AV_96_Align2RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::AV_128_Align2RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::AV_160_Align2RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::AV_192_Align2RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::AV_224_Align2RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::AV_256_Align2RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::AV_288_Align2RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::AV_320_Align2RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::AV_352_Align2RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::AV_384_Align2RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::AV_512_Align2RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::AV_1024_Align2RegClass;
return nullptr;
@@ -2715,9 +2715,9 @@ getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
const TargetRegisterClass *
SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
- if (BitWidth <= 16)
+ if (BitWidth == 16)
return &AMDGPU::VGPR_LO16RegClass;
- if (BitWidth <= 32)
+ if (BitWidth == 32)
return &AMDGPU::AV_32RegClass;
return ST.needsAlignedVGPRs()
? getAlignedVectorSuperClassForBitWidth(BitWidth)
@@ -2726,35 +2726,35 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 16)
+ if (BitWidth == 16)
return &AMDGPU::SGPR_LO16RegClass;
- if (BitWidth <= 32)
+ if (BitWidth == 32)
return &AMDGPU::SReg_32RegClass;
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::SReg_64RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::SGPR_96RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::SGPR_128RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::SGPR_160RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::SGPR_192RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::SGPR_224RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::SGPR_256RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::SGPR_288RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::SGPR_320RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::SGPR_352RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::SGPR_384RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::SGPR_512RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::SGPR_1024RegClass;
return nullptr;
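The wholesale change from <= to == above is the enabling piece: the
*ClassForBitWidth helpers used to round an odd width up to the next class,
silently absorbing types like 448 bits into a 512-bit register; now they
return nullptr for anything but an exact match, and the rounding is done
once, in the legalizer, where it is visible in MIR. A small demo of the
semantic difference (SGPR widths hard-coded; illustrative only):

#include <cstdio>

static const unsigned SGPRWidths[] = {16, 32, 64, 96, 128, 160, 192, 224,
                                      256, 288, 320, 352, 384, 512, 1024};

// Old behaviour: first class wide enough (<=), so 448 mapped to 512.
static unsigned oldLookup(unsigned Bits) {
  for (unsigned W : SGPRWidths)
    if (Bits <= W)
      return W;
  return 0; // no class
}

// New behaviour: exact match only (==), so 448 has no class at all.
static unsigned newLookup(unsigned Bits) {
  for (unsigned W : SGPRWidths)
    if (Bits == W)
      return W;
  return 0; // no class
}

int main() {
  printf("448 bits: old -> %u, new -> %u\n", oldLookup(448), newLookup(448));
  // prints: 448 bits: old -> 512, new -> 0
}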
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index e639fce9d690e..db4e678c74a41 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -2768,8 +2768,13 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v13, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v14, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
+; GCN-NEXT: ; kill: def $vgpr15 killed $sgpr14 killed $exec
+; GCN-NEXT: ; kill: def $vgpr16 killed $sgpr15 killed $exec
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v16, vcc
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: v_readfirstlane_b32 s1, v1
; GCN-NEXT: ; return to shader part epilog
@@ -2808,8 +2813,11 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s12, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s13, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
@@ -2847,8 +2855,11 @@ define amdgpu_ps double @dyn_extract_v7f64_s_v(<7 x double> inreg %vec, i32 %sel
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s12, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s13, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: ; return to shader part epilog
@@ -2879,6 +2890,9 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v14
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v14
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: dyn_extract_v7f64_v_v:
@@ -2903,6 +2917,9 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v14
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: dyn_extract_v7f64_v_v:
@@ -2921,6 +2938,8 @@ define double @dyn_extract_v7f64_v_v(<7 x double> %vec, i32 %sel) {
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v14
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v14
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%ext = extractelement <7 x double> %vec, i32 %sel
@@ -3422,42 +3441,82 @@ define float @dyn_extract_v15f32_const_s_v(i32 %sel) {
; GCN-NEXT: v_mov_b32_e32 v12, 0x41700000
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 14, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v12, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 15, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: dyn_extract_v15f32_const_s_v:
-; GFX10PLUS: ; %bb.0: ; %entry
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41000000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41100000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41200000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41300000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41400000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41500000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41600000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, 0x41700000, vcc_lo
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: dyn_extract_v15f32_const_s_v:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41000000, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41100000, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41200000, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41300000, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41400000, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41500000, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41600000, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41700000, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s4, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: dyn_extract_v15f32_const_s_v:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41000000, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41100000, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41200000, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41300000, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41400000, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41500000, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41600000, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41700000, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%ext = extractelement <15 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>, i32 %sel
ret float %ext
@@ -3557,7 +3616,9 @@ define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel
; GCN-NEXT: v_mov_b32_e32 v15, s16
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 14, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 15, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: dyn_extract_v15f32_s_v:
@@ -3590,7 +3651,9 @@ define amdgpu_ps float @dyn_extract_v15f32_s_v(<15 x float> inreg %vec, i32 %sel
; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s15, vcc_lo
; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, s16, vcc_lo
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s16, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
; GFX10PLUS-NEXT: ; return to shader part epilog
entry:
%ext = extractelement <15 x float> %vec, i32 %sel
@@ -3629,41 +3692,81 @@ define float @dyn_extract_v15f32_v_v(<15 x float> %vec, i32 %sel) {
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 14, v15
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 15, v15
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: dyn_extract_v15f32_v_v:
-; GFX10PLUS: ; %bb.0: ; %entry
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: dyn_extract_v15f32_v_v:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v15
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: dyn_extract_v15f32_v_v:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v15
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%ext = extractelement <15 x float> %vec, i32 %sel
ret float %ext
@@ -3825,6 +3928,8 @@ define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) {
; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 14, v15
; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 15, v15
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
; MOVREL-LABEL: dyn_extract_v15f32_v_v_offset3:
@@ -3859,42 +3964,83 @@ define float @dyn_extract_v15f32_v_v_offset3(<15 x float> %vec, i32 %sel) {
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 14, v15
; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc, 15, v15
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
; MOVREL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10PLUS-LABEL: dyn_extract_v15f32_v_v_offset3:
-; GFX10PLUS: ; %bb.0: ; %entry
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10PLUS-NEXT: v_add_nc_u32_e32 v15, 3, v15
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15
-; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GFX10-LABEL: dyn_extract_v15f32_v_v_offset3:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v15
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: dyn_extract_v15f32_v_v_offset3:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v15
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%add = add i32 %sel, 3
%ext = extractelement <15 x float> %vec, i32 %add
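The extra compare at index 15 in the checks above (and the reads of stale
registers flagged by the "; kill:" comments earlier) are the visible cost
of the widening: <15 x float> is now legalized as <16 x float>, so the
unrolled compare-and-select chain gains one step that selects the padding
lane, whose contents are undefined. A scalar C++ model of the widened
chain (illustrative only; zero stands in for the undef lane to keep the
model well defined):

// One v_cmp_eq + v_cndmask pair per loop iteration.
static float dynExtractPadded16(const float vec[15], unsigned sel) {
  float lanes[16];
  for (int i = 0; i < 15; ++i)
    lanes[i] = vec[i];
  lanes[15] = 0.0f; // undef padding lane in the real code
  float r = lanes[0];
  for (unsigned i = 1; i < 16; ++i)
    r = (sel == i) ? lanes[i] : r;
  return r;
}

For any in-range sel the result is unchanged; the extra step only wins
when sel is 15, which is already out of bounds for the original
<15 x float> and therefore yields poison anyway, so an undefined value is
permitted.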
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 730e3f5dcd33b..7d5c1fade0b3b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -5670,6 +5670,10 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg
; GPRIDX-NEXT: s_mov_b32 s12, s14
; GPRIDX-NEXT: s_mov_b32 s13, s15
; GPRIDX-NEXT: v_mov_b32_e32 v18, s15
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s0
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GPRIDX-NEXT: v_mov_b32_e32 v4, s1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
; GPRIDX-NEXT: v_mov_b32_e32 v17, s14
; GPRIDX-NEXT: v_mov_b32_e32 v16, s13
; GPRIDX-NEXT: v_mov_b32_e32 v15, s12
@@ -5683,43 +5687,39 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg
; GPRIDX-NEXT: v_mov_b32_e32 v7, s4
; GPRIDX-NEXT: v_mov_b32_e32 v6, s3
; GPRIDX-NEXT: v_mov_b32_e32 v5, s2
-; GPRIDX-NEXT: v_mov_b32_e32 v4, s1
-; GPRIDX-NEXT: v_mov_b32_e32 v3, s0
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v2
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v2
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v2
-; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
-; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v5, v0, s[10:11]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[0:1]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v0, s[2:3]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v11, v0, s[4:5]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v13, v0, s[6:7]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[8:9]
-; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
-; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v1, s[10:11]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[2:3]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[4:5]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v14, v1, s[6:7]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9]
; GPRIDX-NEXT: v_readfirstlane_b32 s0, v3
-; GPRIDX-NEXT: v_readfirstlane_b32 s1, v4
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2
-; GPRIDX-NEXT: v_readfirstlane_b32 s3, v6
-; GPRIDX-NEXT: v_readfirstlane_b32 s4, v5
-; GPRIDX-NEXT: v_readfirstlane_b32 s5, v8
-; GPRIDX-NEXT: v_readfirstlane_b32 s6, v7
-; GPRIDX-NEXT: v_readfirstlane_b32 s7, v10
-; GPRIDX-NEXT: v_readfirstlane_b32 s8, v9
-; GPRIDX-NEXT: v_readfirstlane_b32 s9, v12
-; GPRIDX-NEXT: v_readfirstlane_b32 s10, v11
-; GPRIDX-NEXT: v_readfirstlane_b32 s11, v13
-; GPRIDX-NEXT: v_readfirstlane_b32 s12, v0
-; GPRIDX-NEXT: v_readfirstlane_b32 s13, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GPRIDX-NEXT: v_readfirstlane_b32 s1, v3
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc
+; GPRIDX-NEXT: v_readfirstlane_b32 s2, v3
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v8, v1, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v10, v1, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v11, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v12, v1, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v13, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v14, v1, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v15, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v13, v16, v1, vcc
+; GPRIDX-NEXT: v_readfirstlane_b32 s3, v3
+; GPRIDX-NEXT: v_readfirstlane_b32 s4, v4
+; GPRIDX-NEXT: v_readfirstlane_b32 s5, v5
+; GPRIDX-NEXT: v_readfirstlane_b32 s6, v6
+; GPRIDX-NEXT: v_readfirstlane_b32 s7, v7
+; GPRIDX-NEXT: v_readfirstlane_b32 s8, v8
+; GPRIDX-NEXT: v_readfirstlane_b32 s9, v9
+; GPRIDX-NEXT: v_readfirstlane_b32 s10, v10
+; GPRIDX-NEXT: v_readfirstlane_b32 s11, v11
+; GPRIDX-NEXT: v_readfirstlane_b32 s12, v12
+; GPRIDX-NEXT: v_readfirstlane_b32 s13, v13
; GPRIDX-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: dyn_insertelement_v7f64_s_v_v:
@@ -5739,9 +5739,13 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg
; GFX10-NEXT: s_mov_b32 s12, s14
; GFX10-NEXT: s_mov_b32 s13, s15
; GFX10-NEXT: v_mov_b32_e32 v18, s15
+; GFX10-NEXT: v_mov_b32_e32 v3, s0
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT: v_mov_b32_e32 v4, s1
; GFX10-NEXT: v_mov_b32_e32 v17, s14
; GFX10-NEXT: v_mov_b32_e32 v16, s13
; GFX10-NEXT: v_mov_b32_e32 v15, s12
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v14, s11
; GFX10-NEXT: v_mov_b32_e32 v13, s10
; GFX10-NEXT: v_mov_b32_e32 v12, s9
@@ -5752,43 +5756,39 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg
; GFX10-NEXT: v_mov_b32_e32 v7, s4
; GFX10-NEXT: v_mov_b32_e32 v6, s3
; GFX10-NEXT: v_mov_b32_e32 v5, s2
-; GFX10-NEXT: v_mov_b32_e32 v4, s1
-; GFX10-NEXT: v_mov_b32_e32 v3, s0
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v2
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 6, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s0, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT: v_readfirstlane_b32 s1, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s2, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
+; GFX10-NEXT: v_readfirstlane_b32 s4, v4
+; GFX10-NEXT: v_readfirstlane_b32 s5, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v2
-; GFX10-NEXT: v_readfirstlane_b32 s2, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v12, v1, vcc_lo
-; GFX10-NEXT: v_readfirstlane_b32 s3, v6
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v13, v0, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v14, v1, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v16, v1, s1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v3
-; GFX10-NEXT: v_readfirstlane_b32 s1, v4
-; GFX10-NEXT: v_readfirstlane_b32 s4, v7
-; GFX10-NEXT: v_readfirstlane_b32 s5, v8
-; GFX10-NEXT: v_readfirstlane_b32 s6, v9
-; GFX10-NEXT: v_readfirstlane_b32 s7, v10
-; GFX10-NEXT: v_readfirstlane_b32 s8, v11
-; GFX10-NEXT: v_readfirstlane_b32 s9, v2
-; GFX10-NEXT: v_readfirstlane_b32 s10, v12
-; GFX10-NEXT: v_readfirstlane_b32 s11, v13
-; GFX10-NEXT: v_readfirstlane_b32 s12, v0
-; GFX10-NEXT: v_readfirstlane_b32 s13, v1
+; GFX10-NEXT: v_readfirstlane_b32 s6, v6
+; GFX10-NEXT: v_readfirstlane_b32 s7, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v11, v0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v12, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2
+; GFX10-NEXT: v_readfirstlane_b32 s8, v8
+; GFX10-NEXT: v_readfirstlane_b32 s9, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v13, v0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v14, v1, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2
+; GFX10-NEXT: v_readfirstlane_b32 s10, v10
+; GFX10-NEXT: v_readfirstlane_b32 s11, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v16, v1, vcc_lo
+; GFX10-NEXT: v_readfirstlane_b32 s12, v12
+; GFX10-NEXT: v_readfirstlane_b32 s13, v13
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: dyn_insertelement_v7f64_s_v_v:
@@ -5808,45 +5808,45 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_s_v_v(<7 x double> inreg
; GFX11-NEXT: s_mov_b32 s12, s14
; GFX11-NEXT: s_mov_b32 s13, s15
; GFX11-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v17, s14
+; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_dual_mov_b32 v16, s13 :: v_dual_mov_b32 v15, s12
; GFX11-NEXT: v_dual_mov_b32 v14, s11 :: v_dual_mov_b32 v13, s10
; GFX11-NEXT: v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v11, s8
; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v9, s6
; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v7, s4
; GFX11-NEXT: v_dual_mov_b32 v6, s3 :: v_dual_mov_b32 v5, s2
-; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v2
-; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v0 :: v_dual_cndmask_b32 v4, v4, v1
+; GFX11-NEXT: v_dual_cndmask_b32 v18, v3, v0 :: v_dual_cndmask_b32 v17, v4, v1
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v0 :: v_dual_cndmask_b32 v8, v8, v1
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v2
-; GFX11-NEXT: v_readfirstlane_b32 s2, v5
-; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v0 :: v_dual_cndmask_b32 v2, v12, v1
-; GFX11-NEXT: v_readfirstlane_b32 s3, v6
-; GFX11-NEXT: v_cndmask_b32_e64 v12, v13, v0, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v13, v14, v1, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v15, v0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v16, v1, s1
-; GFX11-NEXT: v_readfirstlane_b32 s0, v3
-; GFX11-NEXT: v_readfirstlane_b32 s1, v4
-; GFX11-NEXT: v_readfirstlane_b32 s4, v7
-; GFX11-NEXT: v_readfirstlane_b32 s5, v8
-; GFX11-NEXT: v_readfirstlane_b32 s6, v9
-; GFX11-NEXT: v_readfirstlane_b32 s7, v10
+; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 4, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v0, s0
+; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v1, s0
+; GFX11-NEXT: v_cmp_eq_u32_e64 s9, 6, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v6, v7, v0 :: v_dual_cndmask_b32 v5, v8, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v0, s6
+; GFX11-NEXT: v_readfirstlane_b32 s0, v18
+; GFX11-NEXT: v_readfirstlane_b32 s1, v17
+; GFX11-NEXT: v_readfirstlane_b32 s2, v3
+; GFX11-NEXT: v_dual_cndmask_b32 v7, v9, v0 :: v_dual_cndmask_b32 v8, v10, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2
+; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, v1, s6
+; GFX11-NEXT: v_cndmask_b32_e64 v12, v15, v0, s9
+; GFX11-NEXT: v_readfirstlane_b32 s3, v4
+; GFX11-NEXT: v_readfirstlane_b32 s4, v6
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v13, v0, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v1, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v13, v16, v1, s9
+; GFX11-NEXT: v_readfirstlane_b32 s5, v5
+; GFX11-NEXT: v_readfirstlane_b32 s6, v7
+; GFX11-NEXT: v_readfirstlane_b32 s7, v8
; GFX11-NEXT: v_readfirstlane_b32 s8, v11
-; GFX11-NEXT: v_readfirstlane_b32 s9, v2
-; GFX11-NEXT: v_readfirstlane_b32 s10, v12
-; GFX11-NEXT: v_readfirstlane_b32 s11, v13
-; GFX11-NEXT: v_readfirstlane_b32 s12, v0
-; GFX11-NEXT: v_readfirstlane_b32 s13, v1
+; GFX11-NEXT: v_readfirstlane_b32 s9, v9
+; GFX11-NEXT: v_readfirstlane_b32 s10, v10
+; GFX11-NEXT: v_readfirstlane_b32 s11, v14
+; GFX11-NEXT: v_readfirstlane_b32 s12, v12
+; GFX11-NEXT: v_readfirstlane_b32 s13, v13
; GFX11-NEXT: ; return to shader part epilog
entry:
%insert = insertelement <7 x double> %vec, double %val, i32 %idx
@@ -5908,26 +5908,26 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec,
; GPRIDX-LABEL: dyn_insertelement_v7f64_v_v_v:
; GPRIDX: ; %bb.0: ; %entry
; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v16
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 2, v16
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v16
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 4, v16
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 5, v16
-; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 6, v16
; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
-; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[0:1]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[2:3]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[6:7]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v14, s[8:9]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[10:11]
; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
-; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[0:1]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v15, s[2:3]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[4:5]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[6:7]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9]
-; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[10:11]
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v15, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v15, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16
+; GPRIDX-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16
+; GPRIDX-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc
; GPRIDX-NEXT: v_readfirstlane_b32 s0, v0
; GPRIDX-NEXT: v_readfirstlane_b32 s1, v1
; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2
@@ -5947,38 +5947,38 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec,
; GFX10-LABEL: dyn_insertelement_v7f64_v_v_v:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v16
-; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 2, v16
-; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 3, v16
-; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 4, v16
-; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v16
-; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 6, v16
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v14, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v14, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v14, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v14, s3
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v14, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v14, s5
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v15, s1
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v15, s3
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v15, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v15, s5
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v15, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v15, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16
; GFX10-NEXT: v_readfirstlane_b32 s4, v4
; GFX10-NEXT: v_readfirstlane_b32 s5, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16
; GFX10-NEXT: v_readfirstlane_b32 s6, v6
; GFX10-NEXT: v_readfirstlane_b32 s7, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16
; GFX10-NEXT: v_readfirstlane_b32 s8, v8
; GFX10-NEXT: v_readfirstlane_b32 s9, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16
; GFX10-NEXT: v_readfirstlane_b32 s10, v10
; GFX10-NEXT: v_readfirstlane_b32 s11, v11
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v15, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s12, v12
; GFX10-NEXT: v_readfirstlane_b32 s13, v13
; GFX10-NEXT: ; return to shader part epilog
@@ -5986,14 +5986,14 @@ define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec,
; GFX11-LABEL: dyn_insertelement_v7f64_v_v_v:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v16
-; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 6, v16
+; GFX11-NEXT: v_cmp_eq_u32_e64 s9, 5, v16
+; GFX11-NEXT: v_cmp_eq_u32_e64 s10, 6, v16
; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v14, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v15, s0
-; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v14, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v15, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v14, s9
+; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v15, s9
+; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v14, s10
+; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v15, s10
; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v14 :: v_dual_cndmask_b32 v3, v3, v15
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16
; GFX11-NEXT: v_readfirstlane_b32 s0, v0