[llvm] 69192e0 - [LegalizeDAG] Optimize CodeGen for `ISD::CTLZ_ZERO_UNDEF` (#83039)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 8 06:01:36 PDT 2024
Author: Manish Kausik H
Date: 2024-07-08T14:01:32+01:00
New Revision: 69192e0193e60c169c7776f444362dffba31eb7d
URL: https://github.com/llvm/llvm-project/commit/69192e0193e60c169c7776f444362dffba31eb7d
DIFF: https://github.com/llvm/llvm-project/commit/69192e0193e60c169c7776f444362dffba31eb7d.diff
LOG: [LegalizeDAG] Optimize CodeGen for `ISD::CTLZ_ZERO_UNDEF` (#83039)
Previously, the same instructions were generated for `ISD::CTLZ` and `ISD::CTLZ_ZERO_UNDEF`, which did not take advantage of the fact that zero is an invalid input to `ISD::CTLZ_ZERO_UNDEF`. This commit separates codegen for the two cases so that the latter can be optimized.
The details of the optimization are outlined in #82075
Fixes #82075
Co-authored-by: Manish Kausik H <hmamishkausik at gmail.com>
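In concrete terms, for an i8 ctlz_zero_undef promoted to i32, the old lowering zero-extended the operand, counted, and then subtracted the 24 extra leading bits, whereas the new lowering shifts the operand into the top bits and counts directly. The C sketch below is purely illustrative (the function names are hypothetical, not from the patch); __builtin_clz is undefined for a zero argument, which mirrors the CTLZ_ZERO_UNDEF contract.

/* Illustrative sketch of the two lowerings for an i8 ctlz_zero_undef
   promoted to i32. Not part of the patch. */
#include <stdint.h>

/* Old lowering: zext + ctlz + sub (three operations). */
static uint32_t ctlz_i8_old(uint8_t x) {
    return (uint32_t)__builtin_clz((uint32_t)x) - 24; /* 32 - 8 = 24 */
}

/* New lowering: shl + ctlz (two operations). */
static uint32_t ctlz_i8_new(uint8_t x) {
    return (uint32_t)__builtin_clz((uint32_t)x << 24);
}

Both forms agree for every non-zero input because shifting left by the width difference moves the narrow value's leading zeros to the front of the wide register, eliminating the subtract.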
Added:
llvm/test/CodeGen/AArch64/ctlz_zero_undef.ll
Modified:
llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
llvm/test/CodeGen/Hexagon/bitmanip.ll
llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll
llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
llvm/test/CodeGen/SystemZ/scalar-ctlz-01.ll
llvm/test/CodeGen/VE/Scalar/ctlz.ll
llvm/test/CodeGen/X86/ctlo.ll
llvm/test/CodeGen/X86/ctlz.ll
llvm/test/CodeGen/X86/lzcnt.ll
llvm/test/CodeGen/X86/pr38539.ll
llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 86de1f3be9047..3f1094e0ac703 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2461,13 +2461,22 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
}
+ unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
+
+ if (MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
+ // An optimization where the result is the CTLZ after the left shift by
+ // (Difference in widety and current ty), that is,
+ // MIBSrc = MIBSrc << (sizeinbits(WideTy) - sizeinbits(CurTy))
+ // Result = ctlz MIBSrc
+ MIBSrc = MIRBuilder.buildShl(WideTy, MIBSrc,
+ MIRBuilder.buildConstant(WideTy, SizeDiff));
+ }
+
// Perform the operation at the larger size.
auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
// This is already the correct result for CTPOP and CTTZs
- if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
- MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
+ if (MI.getOpcode() == TargetOpcode::G_CTLZ) {
// The correct result is NewOp - (Difference in widety and current ty).
- unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
MIBNewOp = MIRBuilder.buildSub(
WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
}
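As a quick sanity check of the identity this widening relies on (the snippet is illustrative and not part of the patch), the subtract form and the shift form can be compared exhaustively for the i8-to-i32 case:

/* Standalone, exhaustive check (not from the patch) that
   clz(zext(x)) - 24 == clz(x << 24) for all non-zero 8-bit x. */
#include <assert.h>
#include <stdint.h>

int main(void) {
    for (uint32_t x = 1; x <= 0xFFu; ++x) {
        uint32_t sub_form = (uint32_t)__builtin_clz(x) - 24u;
        uint32_t shl_form = (uint32_t)__builtin_clz(x << 24);
        assert(sub_form == shl_form);
    }
    return 0;
}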
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d036a0285e571..d6a0dd9ae9b20 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -5083,7 +5083,6 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTLZ:
- case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTPOP: {
// Zero extend the argument unless its cttz, then use any_extend.
if (Node->getOpcode() == ISD::CTTZ ||
@@ -5106,7 +5105,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
// Perform the larger operation. For CTPOP and CTTZ_ZERO_UNDEF, this is
// already the correct result.
Tmp1 = DAG.getNode(NewOpc, dl, NVT, Tmp1);
- if (NewOpc == ISD::CTLZ || NewOpc == ISD::CTLZ_ZERO_UNDEF) {
+ if (NewOpc == ISD::CTLZ) {
// Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
DAG.getConstant(NVT.getSizeInBits() -
@@ -5115,6 +5114,25 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
break;
}
+ case ISD::CTLZ_ZERO_UNDEF: {
+ // We know that the argument is unlikely to be zero, hence we can take a
+ // different approach as compared to ISD::CTLZ
+
+ // Any Extend the argument
+ auto AnyExtendedNode =
+ DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0));
+
+ // Tmp1 = Tmp1 << (sizeinbits(NVT) - sizeinbits(Old VT))
+ auto ShiftConstant = DAG.getShiftAmountConstant(
+ NVT.getSizeInBits() - OVT.getSizeInBits(), NVT, dl);
+ auto LeftShiftResult =
+ DAG.getNode(ISD::SHL, dl, NVT, AnyExtendedNode, ShiftConstant);
+
+ // Perform the larger operation
+ auto CTLZResult = DAG.getNode(Node->getOpcode(), dl, NVT, LeftShiftResult);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, CTLZResult));
+ break;
+ }
case ISD::BITREVERSE:
case ISD::BSWAP: {
unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 38f8f072dccfd..fed5ebcc3c903 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -655,24 +655,46 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
}
}
- // Subtract off the extra leading bits in the bigger type.
- SDValue ExtractLeadingBits = DAG.getConstant(
- NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl, NVT);
- if (!N->isVPOpcode()) {
+ unsigned CtlzOpcode = N->getOpcode();
+ if (CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::VP_CTLZ) {
+ // Subtract off the extra leading bits in the bigger type.
+ SDValue ExtractLeadingBits = DAG.getConstant(
+ NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl, NVT);
+
+ if (!N->isVPOpcode()) {
+ // Zero extend to the promoted type and do the count there.
+ SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::SUB, dl, NVT,
+ DAG.getNode(N->getOpcode(), dl, NVT, Op),
+ ExtractLeadingBits);
+ }
+ SDValue Mask = N->getOperand(1);
+ SDValue EVL = N->getOperand(2);
// Zero extend to the promoted type and do the count there.
- SDValue Op = ZExtPromotedInteger(N->getOperand(0));
- return DAG.getNode(ISD::SUB, dl, NVT,
- DAG.getNode(N->getOpcode(), dl, NVT, Op),
- ExtractLeadingBits);
- }
+ SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL);
+ return DAG.getNode(ISD::VP_SUB, dl, NVT,
+ DAG.getNode(N->getOpcode(), dl, NVT, Op, Mask, EVL),
+ ExtractLeadingBits, Mask, EVL);
+ }
+ if (CtlzOpcode == ISD::CTLZ_ZERO_UNDEF ||
+ CtlzOpcode == ISD::VP_CTLZ_ZERO_UNDEF) {
+ // Any Extend the argument
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ // Op = Op << (sizeinbits(NVT) - sizeinbits(Old VT))
+ unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+ auto ShiftConst =
+ DAG.getShiftAmountConstant(SHLAmount, Op.getValueType(), dl);
+ if (!N->isVPOpcode()) {
+ Op = DAG.getNode(ISD::SHL, dl, NVT, Op, ShiftConst);
+ return DAG.getNode(CtlzOpcode, dl, NVT, Op);
+ }
- SDValue Mask = N->getOperand(1);
- SDValue EVL = N->getOperand(2);
- // Zero extend to the promoted type and do the count there.
- SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL);
- return DAG.getNode(ISD::VP_SUB, dl, NVT,
- DAG.getNode(N->getOpcode(), dl, NVT, Op, Mask, EVL),
- ExtractLeadingBits, Mask, EVL);
+ SDValue Mask = N->getOperand(1);
+ SDValue EVL = N->getOperand(2);
+ Op = DAG.getNode(ISD::VP_SHL, dl, NVT, Op, ShiftConst, Mask, EVL);
+ return DAG.getNode(CtlzOpcode, dl, NVT, Op, Mask, EVL);
+ }
+ llvm_unreachable("Invalid CTLZ Opcode");
}
SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) {
diff --git a/llvm/test/CodeGen/AArch64/ctlz_zero_undef.ll b/llvm/test/CodeGen/AArch64/ctlz_zero_undef.ll
new file mode 100644
index 0000000000000..943ff72633ca6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ctlz_zero_undef.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s --mtriple=aarch64 | FileCheck %s
+
+declare i8 @llvm.ctlz.i8(i8, i1 immarg)
+declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1 immarg)
+declare i11 @llvm.ctlz.i11(i11, i1 immarg)
+
+define i32 @clz_nzu8(i8 %self) {
+; CHECK-LABEL: clz_nzu8:
+; CHECK: // %bb.0: // %start
+; CHECK-NEXT: lsl w8, w0, #24
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+start:
+ %ctlz_res = call i8 @llvm.ctlz.i8(i8 %self, i1 true)
+ %ret = zext i8 %ctlz_res to i32
+ ret i32 %ret
+}
+
+; non standard bit size argument to ctlz
+define i32 @clz_nzu11(i11 %self) {
+; CHECK-LABEL: clz_nzu11:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl w8, w0, #21
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %ctlz_res = call i11 @llvm.ctlz.i11(i11 %self, i1 true)
+ %ret = zext i11 %ctlz_res to i32
+ ret i32 %ret
+}
+
+; vector type argument to ctlz intrinsic
+define <8 x i32> @clz_vec_nzu8(<8 x i8> %self) {
+; CHECK-LABEL: clz_vec_nzu8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: clz v0.8b, v0.8b
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ret
+ %ctlz_res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %self, i1 true)
+ %ret = zext <8 x i8> %ctlz_res to <8 x i32>
+ ret <8 x i32> %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
index 85cfb9b320f15..68587630e2195 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
@@ -200,11 +200,10 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934591
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
- ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s64)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
- ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C1]](s64)
- ; CHECK-NEXT: [[USUBO:%[0-9]+]]:_(s32), [[USUBO1:%[0-9]+]]:_(s1) = G_USUBO [[CTLZ_ZERO_UNDEF]], [[UV]]
- ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[USUBO]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[AND]], [[C1]](s32)
+ ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[SHL]](s64)
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[CTLZ_ZERO_UNDEF]](s32)
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ZEXT]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s33) = G_TRUNC %0
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 756b819099682..d269eb680138b 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -408,9 +408,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
+; VI-NEXT: s_lshl_b32 s2, s2, 16
; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: s_add_i32 s2, s2, -16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -712,8 +711,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; VI-NEXT: v_ffbh_u32_e32 v1, v1
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2168,18 +2167,15 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) {
; SI-LABEL: v_ctlz_zero_undef_i7:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0
; SI-NEXT: v_ffbh_u32_e32 v0, v0
-; SI-NEXT: v_subrev_i32_e32 v0, vcc, 25, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_ctlz_zero_undef_i7:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
-; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
-; VI-NEXT: v_add_u16_e32 v0, -9, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: v_ctlz_zero_undef_i7:
@@ -2204,9 +2200,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out,
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s2, s2, 0x3ffff
-; SI-NEXT: s_flbit_i32_b32 s2, s2
-; SI-NEXT: s_add_i32 s4, s2, -14
+; SI-NEXT: s_lshl_b32 s2, s2, 14
+; SI-NEXT: s_flbit_i32_b32 s4, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_bfe_u32 s4, s4, 0x20010
@@ -2221,15 +2216,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out,
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x3ffff
-; VI-NEXT: s_flbit_i32_b32 s2, s2
+; VI-NEXT: s_lshl_b32 s2, s2, 14
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_i32 s2, s2, -14
+; VI-NEXT: s_flbit_i32_b32 s2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_add_u32 s0, s0, 2
-; VI-NEXT: flat_store_short v[0:1], v2
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_bfe_u32 s2, s2, 0x20010
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -2239,20 +2233,18 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out,
;
; EG-LABEL: s_ctlz_zero_undef_i18:
; EG: ; %bb.0:
-; EG-NEXT: ALU 30, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 28, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X
; EG-NEXT: MEM_RAT MSKOR T0.XW, T2.X
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
-; EG-NEXT: 262143(3.673406e-40), 0(0.000000e+00)
+; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
+; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00)
; EG-NEXT: FFBH_UINT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: -14(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T2.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T1.X, PV.W, PS,
; EG-NEXT: LSHL * T1.W, literal.x, PS,
@@ -2300,17 +2292,15 @@ define i18 @v_ctlz_zero_undef_i18(i18 %val) {
; SI-LABEL: v_ctlz_zero_undef_i18:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0
; SI-NEXT: v_ffbh_u32_e32 v0, v0
-; SI-NEXT: v_add_i32_e32 v0, vcc, -14, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_ctlz_zero_undef_i18:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0
; VI-NEXT: v_ffbh_u32_e32 v0, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, -14, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: v_ctlz_zero_undef_i18:
@@ -2332,23 +2322,19 @@ define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) {
; SI-LABEL: v_ctlz_zero_undef_v2i18:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0x3ffff, v1
-; SI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 14, v1
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_ffbh_u32_e32 v1, v1
-; SI-NEXT: v_add_i32_e32 v0, vcc, -14, v0
-; SI-NEXT: v_add_i32_e32 v1, vcc, -14, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_ctlz_zero_undef_v2i18:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v1, 0x3ffff, v1
-; VI-NEXT: v_and_b32_e32 v0, 0x3ffff, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 14, v1
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_ffbh_u32_e32 v1, v1
-; VI-NEXT: v_add_u32_e32 v0, vcc, -14, v0
-; VI-NEXT: v_add_u32_e32 v1, vcc, -14, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: v_ctlz_zero_undef_v2i18:
@@ -2383,11 +2369,11 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
; VI-LABEL: v_ctlz_zero_undef_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
-; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfff00000, v0
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT: v_ffbh_u32_e32 v0, v0
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: v_ctlz_zero_undef_v2i16:
@@ -2429,13 +2415,13 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
; VI-LABEL: v_ctlz_zero_undef_v3i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_ffbh_u32_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
-; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfff00000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_ffbh_u32_e32 v2, v2
+; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT: v_ffbh_u32_e32 v1, v1
+; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: v_ctlz_zero_undef_v3i16:
@@ -2483,16 +2469,16 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
; VI-LABEL: v_ctlz_zero_undef_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_ffbh_u32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT: v_ffbh_u32_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1
-; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
-; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0xfff00000, v0
-; VI-NEXT: v_add_u32_e32 v1, vcc, 0xfff00000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_ffbh_u32_e32 v2, v2
+; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT: v_ffbh_u32_e32 v3, v3
+; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: v_ctlz_zero_undef_v4i16:
@@ -2567,28 +2553,19 @@ define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) {
; SI-LABEL: v_ctlz_zero_undef_v2i7:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0x7f, v1
-; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 25, v1
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_ffbh_u32_e32 v1, v1
-; SI-NEXT: v_subrev_i32_e32 v0, vcc, 25, v0
-; SI-NEXT: v_subrev_i32_e32 v1, vcc, 25, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_ctlz_zero_undef_v2i7:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_and_b32_e32 v2, 0x7f007f, v0
-; VI-NEXT: v_bfe_u32 v0, v0, 16, 7
+; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 25, v1
; VI-NEXT: v_ffbh_u32_e32 v0, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
-; VI-NEXT: v_add_u16_e32 v1, -9, v0
-; VI-NEXT: v_and_b32_e32 v0, 0x7f, v2
-; VI-NEXT: v_ffbh_u32_e32 v0, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
-; VI-NEXT: v_add_u16_e32 v0, -9, v0
+; VI-NEXT: v_ffbh_u32_e32 v1, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: v_ctlz_zero_undef_v2i7:
diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
index fea02822da8bf..c8fee5d334429 100644
--- a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
+++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
@@ -150,22 +150,23 @@ body: |
%1(s8) = G_TRUNC %0(s32)
; Check that the operation is performed for 32 bits
- ; CLZ: [[COUNT:%[0-9]+]]:_(s32) = G_CTLZ
- ; CLZ-NOT: G_CTLZ_ZERO_UNDEF
+ ; CHECK: [[BITDIFF:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+ ; CHECK: [[R32:%[0-9]+]]:_(s32) = G_SHL [[X32]], [[BITDIFF]]
; LIBCALLS-NOT: G_CTLZ
; LIBCALLS: ADJCALLSTACKDOWN
- ; LIBCALLS: $r0 = COPY [[X32]]
+ ; LIBCALLS: $r0 = COPY [[R32]]
; LIBCALLS: BL &__clzsi2, {{.*}}, implicit $r0, implicit-def $r0
; LIBCALLS: [[COUNT:%[0-9]+]]:_(s32) = COPY $r0
; LIBCALLS: ADJCALLSTACKUP
; LIBCALLS-NOT: G_CTLZ
- ; CHECK: [[BITDIFF:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
- ; CHECK: [[R32:%[0-9]+]]:_(s32) = G_SUB [[COUNT]], [[BITDIFF]]
+ ; CLZ: [[COUNT:%[0-9]+]]:_(s32) = G_CTLZ [[R32]]
+ ; CLZ-NOT: G_CTLZ_ZERO_UNDEF
%2(s8) = G_CTLZ_ZERO_UNDEF %1
- ; CHECK: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[R32]], [[BITDIFF]]
- ; CHECK: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]]
- ; CHECK: $r0 = COPY [[R]]
+ ; LIBCALLS: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[COUNT]], [[BITDIFF]]
+ ; LIBCALLS: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]]
+ ; CLZ: $r0 = COPY [[COUNT]]
+ ; LIBCALLS: $r0 = COPY [[R]]
%3(s32) = G_SEXT %2(s8)
$r0 = COPY %3(s32)
BX_RET 14, $noreg, implicit $r0
diff --git a/llvm/test/CodeGen/Hexagon/bitmanip.ll b/llvm/test/CodeGen/Hexagon/bitmanip.ll
index 9ce7f0576506c..2c21af62d6f39 100644
--- a/llvm/test/CodeGen/Hexagon/bitmanip.ll
+++ b/llvm/test/CodeGen/Hexagon/bitmanip.ll
@@ -50,13 +50,10 @@ define i16 @ctlz_i16(i16 %a0) #0 {
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
-; CHECK-NEXT: r0 = zxth(r0)
+; CHECK-NEXT: r0 = aslh(r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = cl0(r0)
-; CHECK-NEXT: }
-; CHECK-NEXT: {
-; CHECK-NEXT: r0 = add(r0,#-16)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: }
%v0 = tail call i16 @llvm.ctlz.i16(i16 %a0, i1 true) #1
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index a90c244437a03..8caa64c9572ce 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -1671,30 +1671,26 @@ define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind {
;
; RV32ZBB-LABEL: test_ctlz_i8_zero_undef:
; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: andi a0, a0, 255
+; RV32ZBB-NEXT: slli a0, a0, 24
; RV32ZBB-NEXT: clz a0, a0
-; RV32ZBB-NEXT: addi a0, a0, -24
; RV32ZBB-NEXT: ret
;
; RV64ZBB-LABEL: test_ctlz_i8_zero_undef:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: andi a0, a0, 255
+; RV64ZBB-NEXT: slli a0, a0, 56
; RV64ZBB-NEXT: clz a0, a0
-; RV64ZBB-NEXT: addi a0, a0, -56
; RV64ZBB-NEXT: ret
;
; RV32XTHEADBB-LABEL: test_ctlz_i8_zero_undef:
; RV32XTHEADBB: # %bb.0:
-; RV32XTHEADBB-NEXT: andi a0, a0, 255
+; RV32XTHEADBB-NEXT: slli a0, a0, 24
; RV32XTHEADBB-NEXT: th.ff1 a0, a0
-; RV32XTHEADBB-NEXT: addi a0, a0, -24
; RV32XTHEADBB-NEXT: ret
;
; RV64XTHEADBB-LABEL: test_ctlz_i8_zero_undef:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: andi a0, a0, 255
+; RV64XTHEADBB-NEXT: slli a0, a0, 56
; RV64XTHEADBB-NEXT: th.ff1 a0, a0
-; RV64XTHEADBB-NEXT: addi a0, a0, -56
; RV64XTHEADBB-NEXT: ret
%tmp = call i8 @llvm.ctlz.i8(i8 %a, i1 true)
ret i8 %tmp
@@ -1771,30 +1767,26 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
;
; RV32ZBB-LABEL: test_ctlz_i16_zero_undef:
; RV32ZBB: # %bb.0:
-; RV32ZBB-NEXT: zext.h a0, a0
+; RV32ZBB-NEXT: slli a0, a0, 16
; RV32ZBB-NEXT: clz a0, a0
-; RV32ZBB-NEXT: addi a0, a0, -16
; RV32ZBB-NEXT: ret
;
; RV64ZBB-LABEL: test_ctlz_i16_zero_undef:
; RV64ZBB: # %bb.0:
-; RV64ZBB-NEXT: zext.h a0, a0
+; RV64ZBB-NEXT: slli a0, a0, 48
; RV64ZBB-NEXT: clz a0, a0
-; RV64ZBB-NEXT: addi a0, a0, -48
; RV64ZBB-NEXT: ret
;
; RV32XTHEADBB-LABEL: test_ctlz_i16_zero_undef:
; RV32XTHEADBB: # %bb.0:
-; RV32XTHEADBB-NEXT: th.extu a0, a0, 15, 0
+; RV32XTHEADBB-NEXT: slli a0, a0, 16
; RV32XTHEADBB-NEXT: th.ff1 a0, a0
-; RV32XTHEADBB-NEXT: addi a0, a0, -16
; RV32XTHEADBB-NEXT: ret
;
; RV64XTHEADBB-LABEL: test_ctlz_i16_zero_undef:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: th.extu a0, a0, 15, 0
+; RV64XTHEADBB-NEXT: slli a0, a0, 48
; RV64XTHEADBB-NEXT: th.ff1 a0, a0
-; RV64XTHEADBB-NEXT: addi a0, a0, -48
; RV64XTHEADBB-NEXT: ret
%tmp = call i16 @llvm.ctlz.i16(i16 %a, i1 true)
ret i16 %tmp
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll
index acd63f24bb8f7..80d3add385969 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll
@@ -236,13 +236,13 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
;
; RV64XTHEADBB-LABEL: findLastSet_i32:
; RV64XTHEADBB: # %bb.0:
-; RV64XTHEADBB-NEXT: th.extu a1, a0, 31, 0
+; RV64XTHEADBB-NEXT: slli a1, a0, 32
; RV64XTHEADBB-NEXT: th.ff1 a1, a1
-; RV64XTHEADBB-NEXT: addiw a1, a1, -32
; RV64XTHEADBB-NEXT: xori a1, a1, 31
; RV64XTHEADBB-NEXT: snez a0, a0
-; RV64XTHEADBB-NEXT: addiw a0, a0, -1
+; RV64XTHEADBB-NEXT: addi a0, a0, -1
; RV64XTHEADBB-NEXT: or a0, a0, a1
+; RV64XTHEADBB-NEXT: sext.w a0, a0
; RV64XTHEADBB-NEXT: ret
%1 = call i32 @llvm.ctlz.i32(i32 %a, i1 true)
%2 = xor i32 31, %1
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index df413b878172b..58882525e55c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -2604,9 +2604,8 @@ define <vscale x 1 x i9> @vp_ctlz_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_ctlz_zero_undef_nxv1i9:
; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 511
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT: vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
@@ -2614,18 +2613,13 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
; CHECK-NEXT: li a0, 142
; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
-; CHECK-NEXT: li a0, 7
-; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t
; CHECK-NEXT: ret
;
; CHECK-ZVBB-LABEL: vp_ctlz_zero_undef_nxv1i9:
; CHECK-ZVBB: # %bb.0:
-; CHECK-ZVBB-NEXT: li a1, 511
; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-ZVBB-NEXT: vand.vx v8, v8, a1, v0.t
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
-; CHECK-ZVBB-NEXT: li a0, 7
-; CHECK-ZVBB-NEXT: vsub.vx v8, v8, a0, v0.t
; CHECK-ZVBB-NEXT: ret
%v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
ret <vscale x 1 x i9> %v
diff --git a/llvm/test/CodeGen/SystemZ/scalar-ctlz-01.ll b/llvm/test/CodeGen/SystemZ/scalar-ctlz-01.ll
index e932210d3e71f..da687095045ff 100644
--- a/llvm/test/CodeGen/SystemZ/scalar-ctlz-01.ll
+++ b/llvm/test/CodeGen/SystemZ/scalar-ctlz-01.ll
@@ -43,10 +43,10 @@ define i32 @f2(i32 %arg) {
define i32 @f3(i32 %arg) {
; CHECK-LABEL: f3:
; CHECK: # %bb.0:
-; CHECK-NEXT: llgfr %r0, %r2
+; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d
+; CHECK-NEXT: sllg %r0, %r2, 32
; CHECK-NEXT: flogr %r2, %r0
-; CHECK-NEXT: aghi %r2, -32
-; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2q
; CHECK-NEXT: br %r14
%1 = tail call i32 @llvm.ctlz.i32(i32 %arg, i1 true)
ret i32 %1
@@ -69,10 +69,9 @@ define i16 @f5(i16 %arg) {
; CHECK-LABEL: f5:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d
-; CHECK-NEXT: llghr %r0, %r2
-; CHECK-NEXT: flogr %r0, %r0
-; CHECK-NEXT: aghi %r0, -32
-; CHECK-NEXT: ahik %r2, %r0, -16
+; CHECK-NEXT: sllg %r0, %r2, 48
+; CHECK-NEXT: flogr %r2, %r0
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2q
; CHECK-NEXT: br %r14
%1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 true)
ret i16 %1
@@ -95,10 +94,9 @@ define i8 @f7(i8 %arg) {
; CHECK-LABEL: f7:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $r2l killed $r2l def $r2d
-; CHECK-NEXT: llgcr %r0, %r2
-; CHECK-NEXT: flogr %r0, %r0
-; CHECK-NEXT: aghi %r0, -32
-; CHECK-NEXT: ahik %r2, %r0, -24
+; CHECK-NEXT: sllg %r0, %r2, 56
+; CHECK-NEXT: flogr %r2, %r0
+; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2q
; CHECK-NEXT: br %r14
%1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 true)
ret i8 %1
diff --git a/llvm/test/CodeGen/VE/Scalar/ctlz.ll b/llvm/test/CodeGen/VE/Scalar/ctlz.ll
index c8c2b11c5eef6..602b9a86bf032 100644
--- a/llvm/test/CodeGen/VE/Scalar/ctlz.ll
+++ b/llvm/test/CodeGen/VE/Scalar/ctlz.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s
declare i128 @llvm.ctlz.i128(i128, i1)
@@ -31,9 +32,8 @@ define i64 @func64(i64 %p) {
define signext i32 @func32s(i32 signext %p) {
; CHECK-LABEL: func32s:
; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (32)0
+; CHECK-NEXT: sll %s0, %s0, 32
; CHECK-NEXT: ldz %s0, %s0
-; CHECK-NEXT: lea %s0, -32(, %s0)
; CHECK-NEXT: b.l.t (, %s10)
%r = tail call i32 @llvm.ctlz.i32(i32 %p, i1 true)
ret i32 %r
@@ -42,9 +42,8 @@ define signext i32 @func32s(i32 signext %p) {
define zeroext i32 @func32z(i32 zeroext %p) {
; CHECK-LABEL: func32z:
; CHECK: # %bb.0:
+; CHECK-NEXT: sll %s0, %s0, 32
; CHECK-NEXT: ldz %s0, %s0
-; CHECK-NEXT: lea %s0, -32(, %s0)
-; CHECK-NEXT: and %s0, %s0, (32)0
; CHECK-NEXT: b.l.t (, %s10)
%r = tail call i32 @llvm.ctlz.i32(i32 %p, i1 true)
ret i32 %r
@@ -53,11 +52,8 @@ define zeroext i32 @func32z(i32 zeroext %p) {
define signext i16 @func16s(i16 signext %p) {
; CHECK-LABEL: func16s:
; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (48)0
+; CHECK-NEXT: sll %s0, %s0, 48
; CHECK-NEXT: ldz %s0, %s0
-; CHECK-NEXT: lea %s0, -32(, %s0)
-; CHECK-NEXT: adds.w.sx %s0, -16, %s0
-; CHECK-NEXT: and %s0, %s0, (48)0
; CHECK-NEXT: b.l.t (, %s10)
%r = tail call i16 @llvm.ctlz.i16(i16 %p, i1 true)
ret i16 %r
@@ -66,10 +62,8 @@ define signext i16 @func16s(i16 signext %p) {
define zeroext i16 @func16z(i16 zeroext %p) {
; CHECK-LABEL: func16z:
; CHECK: # %bb.0:
+; CHECK-NEXT: sll %s0, %s0, 48
; CHECK-NEXT: ldz %s0, %s0
-; CHECK-NEXT: lea %s0, -32(, %s0)
-; CHECK-NEXT: adds.w.sx %s0, -16, %s0
-; CHECK-NEXT: and %s0, %s0, (48)0
; CHECK-NEXT: b.l.t (, %s10)
%r = tail call i16 @llvm.ctlz.i16(i16 %p, i1 true)
ret i16 %r
@@ -78,11 +72,8 @@ define zeroext i16 @func16z(i16 zeroext %p) {
define signext i8 @func8s(i8 signext %p) {
; CHECK-LABEL: func8s:
; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (56)0
+; CHECK-NEXT: sll %s0, %s0, 56
; CHECK-NEXT: ldz %s0, %s0
-; CHECK-NEXT: lea %s0, -32(, %s0)
-; CHECK-NEXT: adds.w.sx %s0, -24, %s0
-; CHECK-NEXT: and %s0, %s0, (56)0
; CHECK-NEXT: b.l.t (, %s10)
%r = tail call i8 @llvm.ctlz.i8(i8 %p, i1 true)
ret i8 %r
@@ -91,10 +82,8 @@ define signext i8 @func8s(i8 signext %p) {
define zeroext i8 @func8z(i8 zeroext %p) {
; CHECK-LABEL: func8z:
; CHECK: # %bb.0:
+; CHECK-NEXT: sll %s0, %s0, 56
; CHECK-NEXT: ldz %s0, %s0
-; CHECK-NEXT: lea %s0, -32(, %s0)
-; CHECK-NEXT: adds.w.sx %s0, -24, %s0
-; CHECK-NEXT: and %s0, %s0, (56)0
; CHECK-NEXT: b.l.t (, %s10)
%r = tail call i8 @llvm.ctlz.i8(i8 %p, i1 true)
ret i8 %r
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index d735ca5d446cd..7431f94f0fdf2 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -92,8 +92,8 @@ define i8 @ctlo_i8_undef(i8 %x) {
; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-CLZ-NEXT: notb %al
; X86-CLZ-NEXT: movzbl %al, %eax
+; X86-CLZ-NEXT: shll $24, %eax
; X86-CLZ-NEXT: lzcntl %eax, %eax
-; X86-CLZ-NEXT: addl $-24, %eax
; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X86-CLZ-NEXT: retl
;
@@ -101,8 +101,8 @@ define i8 @ctlo_i8_undef(i8 %x) {
; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: notb %dil
; X64-CLZ-NEXT: movzbl %dil, %eax
+; X64-CLZ-NEXT: shll $24, %eax
; X64-CLZ-NEXT: lzcntl %eax, %eax
-; X64-CLZ-NEXT: addl $-24, %eax
; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X64-CLZ-NEXT: retq
%tmp1 = xor i8 %x, -1
diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll
index bd3d36903ee94..87dca62d74168 100644
--- a/llvm/test/CodeGen/X86/ctlz.ll
+++ b/llvm/test/CodeGen/X86/ctlz.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86,X86-NOCMOV
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov | FileCheck %s --check-prefixes=X86,X86-CMOV
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
@@ -31,33 +31,31 @@ define i8 @ctlz_i8(i8 %x) {
;
; X86-CLZ-LABEL: ctlz_i8:
; X86-CLZ: # %bb.0:
-; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-CLZ-NEXT: shll $24, %eax
; X86-CLZ-NEXT: lzcntl %eax, %eax
-; X86-CLZ-NEXT: addl $-24, %eax
; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X86-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i8:
; X64-CLZ: # %bb.0:
-; X64-CLZ-NEXT: movzbl %dil, %eax
-; X64-CLZ-NEXT: lzcntl %eax, %eax
-; X64-CLZ-NEXT: addl $-24, %eax
+; X64-CLZ-NEXT: shll $24, %edi
+; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X64-CLZ-NEXT: retq
;
; X64-FASTLZCNT-LABEL: ctlz_i8:
; X64-FASTLZCNT: # %bb.0:
-; X64-FASTLZCNT-NEXT: movzbl %dil, %eax
-; X64-FASTLZCNT-NEXT: lzcntl %eax, %eax
-; X64-FASTLZCNT-NEXT: addl $-24, %eax
+; X64-FASTLZCNT-NEXT: shll $24, %edi
+; X64-FASTLZCNT-NEXT: lzcntl %edi, %eax
; X64-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax
; X64-FASTLZCNT-NEXT: retq
;
; X86-FASTLZCNT-LABEL: ctlz_i8:
; X86-FASTLZCNT: # %bb.0:
-; X86-FASTLZCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-FASTLZCNT-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-FASTLZCNT-NEXT: shll $24, %eax
; X86-FASTLZCNT-NEXT: lzcntl %eax, %eax
-; X86-FASTLZCNT-NEXT: addl $-24, %eax
; X86-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax
; X86-FASTLZCNT-NEXT: retl
%tmp2 = call i8 @llvm.ctlz.i8( i8 %x, i1 true )
@@ -664,8 +662,8 @@ define i8 @ctlz_i8_knownbits(i8 %x) {
; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-CLZ-NEXT: orb $64, %al
; X86-CLZ-NEXT: movzbl %al, %eax
+; X86-CLZ-NEXT: shll $24, %eax
; X86-CLZ-NEXT: lzcntl %eax, %eax
-; X86-CLZ-NEXT: addl $-24, %eax
; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X86-CLZ-NEXT: retl
;
@@ -673,8 +671,8 @@ define i8 @ctlz_i8_knownbits(i8 %x) {
; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: orb $64, %dil
; X64-CLZ-NEXT: movzbl %dil, %eax
+; X64-CLZ-NEXT: shll $24, %eax
; X64-CLZ-NEXT: lzcntl %eax, %eax
-; X64-CLZ-NEXT: addl $-24, %eax
; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X64-CLZ-NEXT: retq
;
@@ -682,8 +680,8 @@ define i8 @ctlz_i8_knownbits(i8 %x) {
; X64-FASTLZCNT: # %bb.0:
; X64-FASTLZCNT-NEXT: orb $64, %dil
; X64-FASTLZCNT-NEXT: movzbl %dil, %eax
+; X64-FASTLZCNT-NEXT: shll $24, %eax
; X64-FASTLZCNT-NEXT: lzcntl %eax, %eax
-; X64-FASTLZCNT-NEXT: addl $-24, %eax
; X64-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax
; X64-FASTLZCNT-NEXT: retq
;
@@ -692,8 +690,8 @@ define i8 @ctlz_i8_knownbits(i8 %x) {
; X86-FASTLZCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-FASTLZCNT-NEXT: orb $64, %al
; X86-FASTLZCNT-NEXT: movzbl %al, %eax
+; X86-FASTLZCNT-NEXT: shll $24, %eax
; X86-FASTLZCNT-NEXT: lzcntl %eax, %eax
-; X86-FASTLZCNT-NEXT: addl $-24, %eax
; X86-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax
; X86-FASTLZCNT-NEXT: retl
@@ -927,18 +925,17 @@ define i8 @ctlz_xor7_i8_true(i8 %x) {
;
; X64-FASTLZCNT-LABEL: ctlz_xor7_i8_true:
; X64-FASTLZCNT: # %bb.0:
-; X64-FASTLZCNT-NEXT: movzbl %dil, %eax
-; X64-FASTLZCNT-NEXT: lzcntl %eax, %eax
-; X64-FASTLZCNT-NEXT: addl $-24, %eax
+; X64-FASTLZCNT-NEXT: shll $24, %edi
+; X64-FASTLZCNT-NEXT: lzcntl %edi, %eax
; X64-FASTLZCNT-NEXT: xorb $7, %al
; X64-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax
; X64-FASTLZCNT-NEXT: retq
;
; X86-FASTLZCNT-LABEL: ctlz_xor7_i8_true:
; X86-FASTLZCNT: # %bb.0:
-; X86-FASTLZCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-FASTLZCNT-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-FASTLZCNT-NEXT: shll $24, %eax
; X86-FASTLZCNT-NEXT: lzcntl %eax, %eax
-; X86-FASTLZCNT-NEXT: addl $-24, %eax
; X86-FASTLZCNT-NEXT: xorb $7, %al
; X86-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax
; X86-FASTLZCNT-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/lzcnt.ll b/llvm/test/CodeGen/X86/lzcnt.ll
index 68cef3f9363f9..b000401973416 100644
--- a/llvm/test/CodeGen/X86/lzcnt.ll
+++ b/llvm/test/CodeGen/X86/lzcnt.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=i686-- -mattr=+lzcnt | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -mattr=+lzcnt | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-- -mattr=+lzcnt | FileCheck %s --check-prefix=X64
@@ -106,25 +106,23 @@ define i64 @t4(i64 %x) nounwind {
define i8 @t5(i8 %x) nounwind {
; X86-LABEL: t5:
; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shll $24, %eax
; X86-NEXT: lzcntl %eax, %eax
-; X86-NEXT: addl $-24, %eax
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: retl
;
; X32-LABEL: t5:
; X32: # %bb.0:
-; X32-NEXT: movzbl %dil, %eax
-; X32-NEXT: lzcntl %eax, %eax
-; X32-NEXT: addl $-24, %eax
+; X32-NEXT: shll $24, %edi
+; X32-NEXT: lzcntl %edi, %eax
; X32-NEXT: # kill: def $al killed $al killed $eax
; X32-NEXT: retq
;
; X64-LABEL: t5:
; X64: # %bb.0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: lzcntl %eax, %eax
-; X64-NEXT: addl $-24, %eax
+; X64-NEXT: shll $24, %edi
+; X64-NEXT: lzcntl %edi, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%tmp = tail call i8 @llvm.ctlz.i8( i8 %x, i1 true )
diff --git a/llvm/test/CodeGen/X86/pr38539.ll b/llvm/test/CodeGen/X86/pr38539.ll
index fbc363f77ec42..e710d3f95e6f4 100644
--- a/llvm/test/CodeGen/X86/pr38539.ll
+++ b/llvm/test/CodeGen/X86/pr38539.ll
@@ -22,79 +22,84 @@ define void @f() nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $160, %esp
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: subl $176, %esp
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movzbl (%eax), %eax
; X86-NEXT: movzbl (%eax), %ecx
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: divb %cl
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: shll $30, %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: sarl $30, %ecx
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: xorl %eax, %edx
; X86-NEXT: xorl %eax, %edi
+; X86-NEXT: xorl %eax, %edx
; X86-NEXT: shrdl $1, %eax, %ecx
; X86-NEXT: xorl %ecx, %esi
; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: sbbl %eax, %edi
; X86-NEXT: sbbl %eax, %edx
-; X86-NEXT: andl $3, %edx
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: sbbl %eax, %edi
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: shldl $30, %edx, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $30, %esi, %edx
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: jne .LBB0_1
; X86-NEXT: # %bb.2: # %BB_udiv-special-cases
-; X86-NEXT: bsrl %esi, %eax
+; X86-NEXT: bsrl %edx, %eax
; X86-NEXT: xorl $31, %eax
; X86-NEXT: addl $32, %eax
; X86-NEXT: jmp .LBB0_3
; X86-NEXT: .LBB0_1:
-; X86-NEXT: bsrl %edi, %eax
+; X86-NEXT: bsrl %ecx, %eax
; X86-NEXT: xorl $31, %eax
; X86-NEXT: .LBB0_3: # %BB_udiv-special-cases
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: testl %edx, %edx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll $30, %esi
; X86-NEXT: jne .LBB0_4
; X86-NEXT: # %bb.5: # %BB_udiv-special-cases
-; X86-NEXT: addl $64, %eax
-; X86-NEXT: jmp .LBB0_6
+; X86-NEXT: movl $64, %esi
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: je .LBB0_7
+; X86-NEXT: jmp .LBB0_8
; X86-NEXT: .LBB0_4:
-; X86-NEXT: bsrl %edx, %eax
-; X86-NEXT: xorl $31, %eax
-; X86-NEXT: addl $32, %eax
-; X86-NEXT: .LBB0_6: # %BB_udiv-special-cases
-; X86-NEXT: subl $62, %eax
-; X86-NEXT: movl $0, %ebx
-; X86-NEXT: sbbl %ebx, %ebx
-; X86-NEXT: sbbl %ecx, %ecx
+; X86-NEXT: bsrl %esi, %esi
+; X86-NEXT: xorl $31, %esi
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: jne .LBB0_8
+; X86-NEXT: .LBB0_7: # %BB_udiv-special-cases
+; X86-NEXT: addl $64, %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: .LBB0_8: # %BB_udiv-special-cases
+; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
; X86-NEXT: addl $-66, %eax
+; X86-NEXT: movl $0, %ebx
; X86-NEXT: adcl $-1, %ebx
-; X86-NEXT: adcl $3, %ecx
-; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: adcl $3, %edx
; X86-NEXT: movb $1, %cl
; X86-NEXT: testb %cl, %cl
-; X86-NEXT: jne .LBB0_11
-; X86-NEXT: # %bb.7: # %BB_udiv-special-cases
-; X86-NEXT: andl $3, %esi
+; X86-NEXT: jne .LBB0_14
+; X86-NEXT: # %bb.9: # %BB_udiv-special-cases
+; X86-NEXT: andl $3, %edx
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorl $65, %ecx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %edx, %ecx
; X86-NEXT: orl %ebx, %ecx
-; X86-NEXT: je .LBB0_11
-; X86-NEXT: # %bb.8: # %udiv-bb1
+; X86-NEXT: je .LBB0_14
+; X86-NEXT: # %bb.10: # %udiv-bb1
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: addl $1, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: andl $3, %ebx
+; X86-NEXT: adcl $0, %edx
+; X86-NEXT: andl $3, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movb $65, %cl
; X86-NEXT: subb %al, %cl
; X86-NEXT: movb %cl, %ch
@@ -102,7 +107,7 @@ define void @f() nounwind {
; X86-NEXT: shrb $3, %cl
; X86-NEXT: andb $15, %cl
; X86-NEXT: negb %cl
-; X86-NEXT: movsbl %cl, %eax
+; X86-NEXT: movsbl %cl, %esi
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -112,28 +117,27 @@ define void @f() nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 120(%esp,%eax), %edi
+; X86-NEXT: movl 136(%esp,%esi), %edi
; X86-NEXT: movb %ch, %cl
; X86-NEXT: shll %cl, %edi
; X86-NEXT: notb %cl
-; X86-NEXT: movl 112(%esp,%eax), %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 116(%esp,%eax), %edx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: shrl %eax
-; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: movl 128(%esp,%esi), %ebx
+; X86-NEXT: movl 132(%esp,%esi), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shrl %esi
+; X86-NEXT: shrl %cl, %esi
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %edx, %ecx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: je .LBB0_11
-; X86-NEXT: # %bb.9: # %udiv-preheader
-; X86-NEXT: orl %eax, %edi
+; X86-NEXT: je .LBB0_13
+; X86-NEXT: # %bb.11: # %udiv-preheader
+; X86-NEXT: andl $3, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: orl %esi, %edi
; X86-NEXT: andl $3, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -150,19 +154,19 @@ define void @f() nounwind {
; X86-NEXT: # kill: def $al killed $al killed $eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $15, %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 64(%esp,%eax), %edi
-; X86-NEXT: movl 68(%esp,%eax), %edx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movzbl %al, %edx
+; X86-NEXT: movl 80(%esp,%edx), %edi
+; X86-NEXT: movl 84(%esp,%edx), %eax
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movb %ch, %cl
; X86-NEXT: shrl %cl, %esi
; X86-NEXT: notb %cl
-; X86-NEXT: movl 72(%esp,%eax), %ebx
+; X86-NEXT: movl 88(%esp,%edx), %ebx
; X86-NEXT: addl %ebx, %ebx
; X86-NEXT: shll %cl, %ebx
; X86-NEXT: orl %esi, %ebx
; X86-NEXT: movb %ch, %cl
-; X86-NEXT: shrdl %cl, %edx, %edi
+; X86-NEXT: shrdl %cl, %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -178,51 +182,52 @@ define void @f() nounwind {
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: .p2align 4, 0x90
-; X86-NEXT: .LBB0_10: # %udiv-do-while
+; X86-NEXT: .LBB0_12: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %ebx, %edi
; X86-NEXT: shldl $1, %ebx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: andl $2, %edx
-; X86-NEXT: shrl %edx
-; X86-NEXT: leal (%edx,%ebx,2), %ebx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: andl $2, %eax
+; X86-NEXT: shrl %eax
+; X86-NEXT: leal (%eax,%edi,2), %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: shldl $1, %edx, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl $1, %eax, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %eax, %eax
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: andl $3, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: sbbl %edi, %edx
+; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: sbbl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: sbbl %ecx, %esi
; X86-NEXT: shll $30, %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: sarl $30, %edx
-; X86-NEXT: sarl $31, %esi
-; X86-NEXT: shrdl $1, %esi, %edx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl $1, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %esi, %eax
+; X86-NEXT: sarl $30, %eax
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: shrdl $1, %esi, %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: subl %edx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: subl %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %esi, %ebx
+; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: andl $3, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
@@ -236,12 +241,13 @@ define void @f() nounwind {
; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %edx, %eax
-; X86-NEXT: jne .LBB0_10
-; X86-NEXT: .LBB0_11: # %udiv-end
+; X86-NEXT: jne .LBB0_12
+; X86-NEXT: .LBB0_13: # %udiv-loop-exit
+; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT: .LBB0_14: # %udiv-end
; X86-NEXT: cmpb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
; X86-NEXT: setne (%eax)
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, (%eax)
+; X86-NEXT: movl %esi, (%eax)
; X86-NEXT: movb $0, (%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
index 831d7e6292e33..0932938b209a4 100644
--- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp
@@ -610,10 +610,10 @@ TEST_F(AArch64GISelMITest, WidenBitCountingCTLZZeroUndef) {
auto CheckStr = R"(
CHECK: [[Trunc:%[0-9]+]]:_(s8) = G_TRUNC
CHECK: [[Zext:%[0-9]+]]:_(s16) = G_ZEXT [[Trunc]]
- CHECK: [[CtlzZu:%[0-9]+]]:_(s16) = G_CTLZ_ZERO_UNDEF [[Zext]]
CHECK: [[Cst8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
- CHECK: [[Sub:%[0-9]+]]:_(s16) = G_SUB [[CtlzZu]]:_, [[Cst8]]:_
- CHECK: [[Trunc:%[0-9]+]]:_(s8) = G_TRUNC [[Sub]]
+ CHECK: [[Shl:%[0-9]+]]:_(s16) = G_SHL [[Zext]]:_, [[Cst8]]:_
+ CHECK: [[CtlzZu:%[0-9]+]]:_(s16) = G_CTLZ_ZERO_UNDEF [[Shl]]
+ CHECK: [[Trunc:%[0-9]+]]:_(s8) = G_TRUNC [[CtlzZu]]
)";
// Check