[llvm] 24b67a9 - [AMDGPU][GlobalISel] Improve regbankselect for 64-bit VGPR ctlz_zero_undef/cttz_zero_undef
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 6 01:42:41 PDT 2021
Author: Jay Foad
Date: 2021-08-06T09:40:48+01:00
New Revision: 24b67a9024cc1a757466b4a40c05b4fd8e4b3c69
URL: https://github.com/llvm/llvm-project/commit/24b67a9024cc1a757466b4a40c05b4fd8e4b3c69
DIFF: https://github.com/llvm/llvm-project/commit/24b67a9024cc1a757466b4a40c05b4fd8e4b3c69.diff
LOG: [AMDGPU][GlobalISel] Improve regbankselect for 64-bit VGPR ctlz_zero_undef/cttz_zero_undef
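We can improve on the generic splitting by using ffbh/ffbl, which have a
defined result (-1) when the input is zero. Because -1 is the largest
unsigned value, a single umin of the two 32-bit halves selects the correct
count, replacing the compare-and-select sequence.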
Differential Revision: https://reviews.llvm.org/D107442
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir
llvm/test/CodeGen/AMDGPU/ctlz.ll
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
llvm/test/CodeGen/AMDGPU/cttz.ll
llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 8cdfcddd9251..e5dc0d80fb06 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2491,9 +2491,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case AMDGPU::G_CTPOP:
- case AMDGPU::G_BITREVERSE:
- case AMDGPU::G_CTLZ_ZERO_UNDEF:
- case AMDGPU::G_CTTZ_ZERO_UNDEF: {
+ case AMDGPU::G_BITREVERSE: {
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::SGPRRegBank)
@@ -2515,6 +2513,38 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
llvm_unreachable("narrowScalar should have succeeded");
return;
}
+ case AMDGPU::G_CTLZ_ZERO_UNDEF:
+ case AMDGPU::G_CTTZ_ZERO_UNDEF: {
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ if (DstBank == &AMDGPU::SGPRRegBank)
+ break;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ const LLT S32 = LLT::scalar(32);
+ LLT Ty = MRI.getType(SrcReg);
+ if (Ty == S32)
+ break;
+
+ // We can narrow this more efficiently than Helper can by using ffbh/ffbl
+ // which return -1 when the input is zero:
+ // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), 32 + (ffbh lo))
+ // (cttz_zero_undef hi:lo) -> (umin 32 + (ffbl hi), (ffbl lo))
+ ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
+ MachineIRBuilder B(MI, ApplyVALU);
+ SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
+ unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
+ ? AMDGPU::G_AMDGPU_FFBH_U32
+ : AMDGPU::G_AMDGPU_FFBL_B32;
+ unsigned Idx = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF;
+ auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
+ auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
+ Y = B.buildAdd(S32, Y, B.buildConstant(S32, 32));
+ Register DstReg = MI.getOperand(0).getReg();
+ B.buildUMin(DstReg, X, Y);
+ MI.eraseFromParent();
+ return;
+ }
case AMDGPU::G_SEXT:
case AMDGPU::G_ZEXT:
case AMDGPU::G_ANYEXT: {
@@ -3729,7 +3759,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_CTLZ_ZERO_UNDEF:
- case AMDGPU::G_CTTZ_ZERO_UNDEF:
+ case AMDGPU::G_CTTZ_ZERO_UNDEF: {
+ unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
+ OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
+ break;
+ }
case AMDGPU::G_CTPOP: {
unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir
index 991da6c35e61..77c9c0c65618 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ctlz-zero-undef.mir
@@ -60,14 +60,12 @@ body: |
; CHECK-LABEL: name: ctlz_zero_undef_s64_v
; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV1]](s32), [[C]]
- ; CHECK: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:vgpr(s32) = G_CTLZ_ZERO_UNDEF [[UV]](s32)
- ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32
- ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[CTLZ_ZERO_UNDEF]], [[C1]]
- ; CHECK: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:vgpr(s32) = G_CTLZ_ZERO_UNDEF [[UV1]](s32)
- ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[CTLZ_ZERO_UNDEF1]]
- ; CHECK: S_ENDPGM 0, implicit [[SELECT]](s32)
+ ; CHECK: [[AMDGPU_FFBH_U32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32)
+ ; CHECK: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBH_U32 [[UV]](s32)
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32
+ ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[AMDGPU_FFBH_U32_1]], [[C]]
+ ; CHECK: [[UMIN:%[0-9]+]]:vgpr(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[ADD]]
+ ; CHECK: S_ENDPGM 0, implicit [[UMIN]](s32)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s32) = G_CTLZ_ZERO_UNDEF %0
S_ENDPGM 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir
index a07b595005c0..3eb1ae47b4db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-cttz-zero-undef.mir
@@ -60,14 +60,12 @@ body: |
; CHECK-LABEL: name: cttz_zero_undef_s64_v
; CHECK: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0
- ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[UV]](s32), [[C]]
- ; CHECK: [[CTTZ_ZERO_UNDEF:%[0-9]+]]:vgpr(s32) = G_CTTZ_ZERO_UNDEF [[UV1]](s32)
- ; CHECK: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32
- ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[CTTZ_ZERO_UNDEF]], [[C1]]
- ; CHECK: [[CTTZ_ZERO_UNDEF1:%[0-9]+]]:vgpr(s32) = G_CTTZ_ZERO_UNDEF [[UV]](s32)
- ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[ADD]], [[CTTZ_ZERO_UNDEF1]]
- ; CHECK: S_ENDPGM 0, implicit [[SELECT]](s32)
+ ; CHECK: [[AMDGPU_FFBL_B32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBL_B32 [[UV]](s32)
+ ; CHECK: [[AMDGPU_FFBL_B32_1:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBL_B32 [[UV1]](s32)
+ ; CHECK: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32
+ ; CHECK: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[AMDGPU_FFBL_B32_1]], [[C]]
+ ; CHECK: [[UMIN:%[0-9]+]]:vgpr(s32) = G_UMIN [[AMDGPU_FFBL_B32_]], [[ADD]]
+ ; CHECK: S_ENDPGM 0, implicit [[UMIN]](s32)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s32) = G_CTTZ_ZERO_UNDEF %0
S_ENDPGM 0, implicit %1
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 1b1fd1b5a630..c9ccb02a2ce3 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -778,20 +778,19 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrsp
; GFX10-GISEL-LABEL: v_ctlz_i64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v0
+; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v3, v0
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v4, v1
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v2, 32, v2
-; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 64, vcc_lo
-; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1]
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3
+; GFX10-GISEL-NEXT: v_min_u32_e32 v3, v4, v3
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 64, vcc_lo
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -900,10 +899,9 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v3, v1
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v4, v2
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[1:2]
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3
+; GFX10-GISEL-NEXT: v_min_u32_e32 v3, v4, v3
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 64, vcc_lo
; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 0f83caea9bb0..19aec679d09a 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -599,17 +599,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out,
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v3, s[4:5]
+; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v4, v1
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc
-; GFX9-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[2:3]
+; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v1, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -698,10 +697,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias
; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[4:5]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v2
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 32, v1
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-GISEL-NEXT: v_min_u32_e32 v1, v2, v1
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 54f7e238b230..7f22262a3b68 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -770,20 +770,19 @@ define amdgpu_kernel void @v_cttz_i64(i64 addrspace(1)* noalias %out, i64 addrsp
; GFX10-GISEL-LABEL: v_cttz_i64:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
+; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v3, v1
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v4, v0
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v2, 32, v2
-; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v4, 64, vcc_lo
-; GFX10-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1]
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3
+; GFX10-GISEL-NEXT: v_min_u32_e32 v3, v4, v3
+; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 64, vcc_lo
+; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -892,10 +891,9 @@ define amdgpu_kernel void @v_cttz_i64_trunc(i32 addrspace(1)* noalias %out, i64
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v3, v2
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v4, v1
-; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3
-; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo
; GFX10-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[1:2]
+; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3
+; GFX10-GISEL-NEXT: v_min_u32_e32 v3, v4, v3
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 64, vcc_lo
; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 36e4773f53e1..0b89327391bf 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1065,16 +1065,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(i64 addrspace(1)* n
; GFX9-GISEL-NEXT: v_or_b32_sdwa v4, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-GISEL-NEXT: v_bfe_u32 v3, v3, 0, 16
; GFX9-GISEL-NEXT: v_bfe_u32 v4, v4, 0, 16
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX9-GISEL-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX9-GISEL-NEXT: v_bfe_u32 v2, v2, 0, 16
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX9-GISEL-NEXT: v_lshl_or_b32 v2, v2, 16, v0
-; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v3
-; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v4, v2
-; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v4, v3
+; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v2
+; GFX9-GISEL-NEXT: v_add_u32_e32 v4, 32, v4
; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc
; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
More information about the llvm-commits
mailing list