[llvm] Optimize count leading ones if promoted type (PR #99591)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 24 03:15:39 PDT 2024


https://github.com/v01dXYZ updated https://github.com/llvm/llvm-project/pull/99591

From 48483829ea41e228f27b7e12e387c21e70d921f8 Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Tue, 23 Jul 2024 15:36:19 +0200
Subject: [PATCH 1/2] [CodeGenPrepare] Do not despeculate count
 leading/trailing ones if promotion

For count leading/trailing ones, i.e. (CTLZ/CTTZ (XOR Op -1)),
legalisation is able to optimise this pattern when a promotion is
necessary.

Despeculation should not be applied in this case, as it would separate
the XOR and the CTLZ/CTTZ into two different basic blocks. This is
particularly problematic with SelectionDAG.
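
For reference, the pattern this affects is the count-leading-ones idiom
below (the same shape as the ctlo_* tests added in the second patch;
illustrative only). The xor and the ctlz must stay in one basic block for
the legaliser to fold them:

  declare i16 @llvm.ctlz.i16(i16, i1)

  define i16 @ctlo_i16(i16 %x) {
    %not = xor i16 %x, -1
    %ctlo = call i16 @llvm.ctlz.i16(i16 %not, i1 false)
    ret i16 %ctlo
  }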
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 22d0708f54786..a3dd712a6db83 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2310,6 +2310,26 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
   if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
     return false;
 
+  // Do not despeculate (ctlz/cttz (xor op -1)) if the operand type is
+  // promoted, as legalisation is later able to transform it to:
+  //
+  // ctlz:
+  // (ctlz_zero_undef (xor (shl (anyextend op) shiftamount)
+  //                       -1))
+  //
+  // cttz:
+  // (cttz_zero_undef (xor (zeroextend op) -1))
+  //
+  // Despeculation is not only unnecessary but also harmful with SelectionDAG,
+  // as the XOR and the CTLZ/CTTZ would end up in different basic blocks.
+  EVT VTy = TLI->getValueType(*DL, Ty);
+  int ISDOpcode = IntrinsicID == Intrinsic::ctlz ? ISD::CTLZ : ISD::CTTZ;
+  if (match(CountZeros->getOperand(0), m_Not(m_Value())) &&
+      (TLI->getTypeAction(CountZeros->getContext(), VTy) ==
+           TargetLowering::TypePromoteInteger ||
+       TLI->getOperationAction(ISDOpcode, VTy) == TargetLowering::Promote))
+    return false;
+
   // Bail if the value is never zero.
   Use &Op = CountZeros->getOperandUse(0);
   if (isKnownNonZero(Op, *DL))

From b6e18dbf734885476c8e68c62a21bee7ca95c6eb Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Tue, 9 Jul 2024 16:42:39 +0200
Subject: [PATCH 2/2] [CodeGen] Legalisation with promotion: optimise count
 leading ones

(CTLZ (XOR Op -1))
  -->
(CTLZ_ZERO_UNDEF (XOR (SHL (ANYEXTEND Op) ShiftAmount) -1))

The optimisation also applies to CTLZ_ZERO_UNDEF, VP_CTLZ and VP_CTLZ_ZERO_UNDEF.

Fixes https://github.com/llvm/llvm-project/issues/96455
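
To illustrate the effect on a promoted type, the widened computation for an
i8 input promoted to i32 is equivalent to the IR below (an illustrative
sketch only; the legaliser itself builds the corresponding DAG/MIR nodes,
and the function name is hypothetical):

  declare i32 @llvm.ctlz.i32(i32, i1)

  define i32 @ctlo_i8_widened(i8 %x) {
    %ext = zext i8 %x to i32      ; ANYEXTEND in the DAG; zext used here for defined IR semantics
    %shl = shl i32 %ext, 24       ; move the i8 bits to the top of the i32
    %not = xor i32 %shl, -1       ; the low 24 bits become ones, so the value is never zero
    %ctlz = call i32 @llvm.ctlz.i32(i32 %not, i1 true)  ; CTLZ_ZERO_UNDEF is therefore safe
    ret i32 %ctlz
  }

This corresponds to the shll $24 / notl / lzcntl sequence in the updated
X86/ctlo.ll checks.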
---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  26 +++++
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  34 ++++++
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |  43 ++++++++
 llvm/test/CodeGen/AArch64/ctlo.ll             | 100 ++++++++++++++++++
 .../test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll |  24 ++---
 llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll        |  52 +++++++++
 llvm/test/CodeGen/X86/ctlo.ll                 |  26 ++---
 7 files changed, 274 insertions(+), 31 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/ctlo.ll

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 3f1094e0ac703..882f2a263f3cd 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2356,6 +2356,25 @@ LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
   return Legalized;
 }
 
+static bool extendCtlzNot(const MachineInstr &MI, MachineIRBuilder &MIRBuilder,
+                          MachineRegisterInfo &MRI, LLT WideTy) {
+  Register Src;
+  if (!mi_match(MI.getOperand(1).getReg(), MRI, m_Not(m_Reg(Src))))
+    return false;
+
+  auto ExtSrc = MIRBuilder.buildAnyExt(WideTy, Src);
+
+  Register SrcReg = MI.getOperand(1).getReg();
+  LLT CurTy = MRI.getType(SrcReg);
+  unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
+  auto LShift = MIRBuilder.buildShl(WideTy, ExtSrc,
+                                    MIRBuilder.buildConstant(WideTy, SizeDiff));
+  auto Not = MIRBuilder.buildNot(WideTy, LShift);
+  MIRBuilder.buildCTLZ_ZERO_UNDEF(MI.getOperand(0), Not);
+
+  return true;
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
   switch (MI.getOpcode()) {
@@ -2449,6 +2468,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
     LLT CurTy = MRI.getType(SrcReg);
     unsigned NewOpc = MI.getOpcode();
+
+    if ((MI.getOpcode() == TargetOpcode::G_CTLZ ||
+         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) &&
+        extendCtlzNot(MI, MIRBuilder, MRI, WideTy)) {
+      MI.eraseFromParent();
+      return Legalized;
+    }
     if (NewOpc == TargetOpcode::G_CTTZ) {
       // The count is the same in the larger type except if the original
       // value was zero.  This can be handled by setting the bit just off
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d6a0dd9ae9b20..5d5656b572e2c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
@@ -54,6 +55,7 @@
 #include <utility>
 
 using namespace llvm;
+using namespace llvm::SDPatternMatch;
 
 #define DEBUG_TYPE "legalizedag"
 
@@ -5049,6 +5051,27 @@ static MVT getPromotedVectorElementType(const TargetLowering &TLI,
   return MidVT;
 }
 
+// (CTLZ (XOR Op -1)) --> (TRUNCATE (CTLZ_ZERO_UNDEF
+//                                    (XOR (SHL (ANYEXTEND Op)
+//                                              ShiftAmount)
+//                                         -1)))
+static bool ExtendCtlzNot(SDNode *Node, SDValue &Result, SDLoc &dl, MVT OVT,
+                          MVT NVT, SelectionDAG &DAG) {
+  SDValue SrcOp;
+  if (!sd_match(Node->getOperand(0), m_Not(m_Value(SrcOp))))
+    return false;
+
+  SDValue ExtSrc = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, SrcOp);
+  unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+  SDValue ShiftConst =
+      DAG.getShiftAmountConstant(SHLAmount, ExtSrc.getValueType(), dl);
+  SDValue LShift = DAG.getNode(ISD::SHL, dl, NVT, ExtSrc, ShiftConst);
+  SDValue Not = DAG.getNOT(dl, LShift, NVT);
+  SDValue Ctlz = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Not);
+  Result = DAG.getNode(ISD::TRUNCATE, dl, OVT, Ctlz);
+  return true;
+}
+
 void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
   LLVM_DEBUG(dbgs() << "Trying to promote node\n");
   SmallVector<SDValue, 8> Results;
@@ -5084,6 +5107,13 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
   case ISD::CTTZ_ZERO_UNDEF:
   case ISD::CTLZ:
   case ISD::CTPOP: {
+    // If the operand of CTLZ is a NOT, push the extend into the NOT.
+    if (Node->getOpcode() == ISD::CTLZ &&
+        ExtendCtlzNot(Node, Tmp1, dl, OVT, NVT, DAG)) {
+      Results.push_back(Tmp1);
+      break;
+    }
+
     // Zero extend the argument unless its cttz, then use any_extend.
     if (Node->getOpcode() == ISD::CTTZ ||
         Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF)
@@ -5115,6 +5145,10 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     break;
   }
   case ISD::CTLZ_ZERO_UNDEF: {
+    if (ExtendCtlzNot(Node, Tmp1, dl, OVT, NVT, DAG)) {
+      Results.push_back(Tmp1);
+      break;
+    }
     // We know that the argument is unlikely to be zero, hence we can take a
     // different approach as compared to ISD::CTLZ
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index fed5ebcc3c903..b013aa364e523 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -19,6 +19,7 @@
 
 #include "LegalizeTypes.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -27,6 +28,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 using namespace llvm;
+using namespace llvm::SDPatternMatch;
 
 #define DEBUG_TYPE "legalize-types"
 
@@ -638,6 +640,39 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
   return Result;
 }
 
+// (CTLZ (XOR Op -1)) --> (CTLZ_ZERO_UNDEF (XOR (SHL (ANYEXTEND Op)
+//                                                   ShiftAmount)
+//                                               -1))
+static bool ExtendCtlzNot(SDNode *Node, SDValue &Result, SDLoc &dl, EVT OVT,
+                          EVT NVT, SelectionDAG &DAG) {
+  SDValue SrcOp;
+  if (!sd_match(Node->getOperand(0), m_Not(m_Value(SrcOp))))
+    return false;
+
+  SDValue ExtSrc = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, SrcOp);
+  unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+  SDValue ShiftConst =
+      DAG.getShiftAmountConstant(SHLAmount, ExtSrc.getValueType(), dl);
+
+  SDValue NCstOp =
+      DAG.getConstant(APInt::getAllOnes(NVT.getScalarSizeInBits()), dl, NVT);
+  if (!Node->isVPOpcode()) {
+    SDValue LShift = DAG.getNode(ISD::SHL, dl, NVT, ExtSrc, ShiftConst);
+    SDValue Not = DAG.getNOT(dl, LShift, NVT);
+    Result = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Not);
+  } else {
+    SDValue Mask = Node->getOperand(1);
+    SDValue EVL = Node->getOperand(2);
+
+    SDValue LShift =
+        DAG.getNode(ISD::VP_SHL, dl, NVT, ExtSrc, ShiftConst, Mask, EVL);
+    SDValue Not = DAG.getNode(ISD::VP_XOR, dl, NVT, LShift, NCstOp, Mask, EVL);
+    Result = DAG.getNode(ISD::VP_CTLZ_ZERO_UNDEF, dl, NVT, Not, Mask, EVL);
+  }
+
+  return true;
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
   EVT OVT = N->getValueType(0);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
@@ -656,6 +691,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
   }
 
   unsigned CtlzOpcode = N->getOpcode();
+  // If the operand of CTLZ is a NOT, push the extend into the NOT.
+  if (SDValue Res;
+      (CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::CTLZ_ZERO_UNDEF ||
+       CtlzOpcode == ISD::VP_CTLZ || CtlzOpcode == ISD::VP_CTLZ_ZERO_UNDEF) &&
+      ExtendCtlzNot(N, Res, dl, OVT, NVT, DAG)) {
+    return Res;
+  }
+
   if (CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::VP_CTLZ) {
     // Subtract off the extra leading bits in the bigger type.
     SDValue ExtractLeadingBits = DAG.getConstant(
diff --git a/llvm/test/CodeGen/AArch64/ctlo.ll b/llvm/test/CodeGen/AArch64/ctlo.ll
new file mode 100644
index 0000000000000..5f15f540f458d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ctlo.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mtriple=aarch64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s --mtriple=aarch64 -global-isel -verify-machineinstrs | FileCheck %s
+
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @ctlo_i8(i8 %x) {
+; CHECK-LABEL: ctlo_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    eor w8, w8, w0, lsl #24
+; CHECK-NEXT:    clz w0, w8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i8 %x, -1
+  %tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 false )
+  ret i8 %tmp2
+}
+
+define i8 @ctlo_i8_undef(i8 %x) {
+; CHECK-LABEL: ctlo_i8_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    eor w8, w8, w0, lsl #24
+; CHECK-NEXT:    clz w0, w8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i8 %x, -1
+  %tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 true )
+  ret i8 %tmp2
+}
+
+define i16 @ctlo_i16(i16 %x) {
+; CHECK-LABEL: ctlo_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    eor w8, w8, w0, lsl #16
+; CHECK-NEXT:    clz w0, w8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i16 %x, -1
+  %tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 false )
+  ret i16 %tmp2
+}
+
+define i16 @ctlo_i16_undef(i16 %x) {
+; CHECK-LABEL: ctlo_i16_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    eor w8, w8, w0, lsl #16
+; CHECK-NEXT:    clz w0, w8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i16 %x, -1
+  %tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 true )
+  ret i16 %tmp2
+}
+
+define i32 @ctlo_i32(i32 %x) {
+; CHECK-LABEL: ctlo_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    clz w0, w8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i32 %x, -1
+  %tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 false )
+  ret i32 %tmp2
+}
+
+define i32 @ctlo_i32_undef(i32 %x) {
+; CHECK-LABEL: ctlo_i32_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    clz w0, w8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i32 %x, -1
+  %tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 true )
+  ret i32 %tmp2
+}
+
+define i64 @ctlo_i64(i64 %x) {
+; CHECK-LABEL: ctlo_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn x8, x0
+; CHECK-NEXT:    clz x0, x8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i64 %x, -1
+  %tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 false )
+  ret i64 %tmp2
+}
+
+define i64 @ctlo_i64_undef(i64 %x) {
+; CHECK-LABEL: ctlo_i64_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn x8, x0
+; CHECK-NEXT:    clz x0, x8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i64 %x, -1
+  %tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 true )
+  ret i64 %tmp2
+}
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index f17cec231f323..e993ecfcdf3b8 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -89,18 +89,14 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 define i8 @test_not_ctlz_i8(i8 %a) nounwind {
 ; LA32-LABEL: test_not_ctlz_i8:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    ori $a1, $zero, 255
-; LA32-NEXT:    andn $a0, $a1, $a0
-; LA32-NEXT:    clz.w $a0, $a0
-; LA32-NEXT:    addi.w $a0, $a0, -24
+; LA32-NEXT:    slli.w $a0, $a0, 24
+; LA32-NEXT:    clo.w $a0, $a0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: test_not_ctlz_i8:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    ori $a1, $zero, 255
-; LA64-NEXT:    andn $a0, $a1, $a0
-; LA64-NEXT:    clz.d $a0, $a0
-; LA64-NEXT:    addi.d $a0, $a0, -56
+; LA64-NEXT:    slli.d $a0, $a0, 56
+; LA64-NEXT:    clo.d $a0, $a0
 ; LA64-NEXT:    ret
   %neg = xor i8 %a, -1
   %tmp = call i8 @llvm.ctlz.i8(i8 %neg, i1 false)
@@ -110,18 +106,14 @@ define i8 @test_not_ctlz_i8(i8 %a) nounwind {
 define i16 @test_not_ctlz_i16(i16 %a) nounwind {
 ; LA32-LABEL: test_not_ctlz_i16:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    nor $a0, $a0, $zero
-; LA32-NEXT:    bstrpick.w $a0, $a0, 15, 0
-; LA32-NEXT:    clz.w $a0, $a0
-; LA32-NEXT:    addi.w $a0, $a0, -16
+; LA32-NEXT:    slli.w $a0, $a0, 16
+; LA32-NEXT:    clo.w $a0, $a0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: test_not_ctlz_i16:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    nor $a0, $a0, $zero
-; LA64-NEXT:    bstrpick.d $a0, $a0, 15, 0
-; LA64-NEXT:    clz.d $a0, $a0
-; LA64-NEXT:    addi.d $a0, $a0, -48
+; LA64-NEXT:    slli.d $a0, $a0, 48
+; LA64-NEXT:    clo.d $a0, $a0
 ; LA64-NEXT:    ret
   %neg = xor i16 %a, -1
   %tmp = call i16 @llvm.ctlz.i16(i16 %neg, i1 false)
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 58882525e55c4..6f89489bb39d6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -2624,6 +2624,58 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
   %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i9> %v
 }
+define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_nxv1i9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9:
+; CHECK-ZVBB:       # %bb.0:
+; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    ret
+  %va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
+  %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i9> %v
+}
+define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_zero_undef_nxv1i9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_zero_undef_nxv1i9:
+; CHECK-ZVBB:       # %bb.0:
+; CHECK-ZVBB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT:    ret
+  %va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
+  %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 true, <vscale x 1 x i1> %m, i32 %evl)
+  ret <vscale x 1 x i9> %v
+}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; RV32: {{.*}}
 ; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index 7431f94f0fdf2..020d6d1b80136 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -46,20 +46,18 @@ define i8 @ctlo_i8(i8 %x) {
 ;
 ; X86-CLZ-LABEL: ctlo_i8:
 ; X86-CLZ:       # %bb.0:
-; X86-CLZ-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT:    notb %al
-; X86-CLZ-NEXT:    movzbl %al, %eax
+; X86-CLZ-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-CLZ-NEXT:    shll $24, %eax
+; X86-CLZ-NEXT:    notl %eax
 ; X86-CLZ-NEXT:    lzcntl %eax, %eax
-; X86-CLZ-NEXT:    addl $-24, %eax
 ; X86-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CLZ-NEXT:    retl
 ;
 ; X64-CLZ-LABEL: ctlo_i8:
 ; X64-CLZ:       # %bb.0:
-; X64-CLZ-NEXT:    notb %dil
-; X64-CLZ-NEXT:    movzbl %dil, %eax
-; X64-CLZ-NEXT:    lzcntl %eax, %eax
-; X64-CLZ-NEXT:    addl $-24, %eax
+; X64-CLZ-NEXT:    shll $24, %edi
+; X64-CLZ-NEXT:    notl %edi
+; X64-CLZ-NEXT:    lzcntl %edi, %eax
 ; X64-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT:    retq
   %tmp1 = xor i8 %x, -1
@@ -89,20 +87,18 @@ define i8 @ctlo_i8_undef(i8 %x) {
 ;
 ; X86-CLZ-LABEL: ctlo_i8_undef:
 ; X86-CLZ:       # %bb.0:
-; X86-CLZ-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT:    notb %al
-; X86-CLZ-NEXT:    movzbl %al, %eax
+; X86-CLZ-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CLZ-NEXT:    shll $24, %eax
+; X86-CLZ-NEXT:    notl %eax
 ; X86-CLZ-NEXT:    lzcntl %eax, %eax
 ; X86-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CLZ-NEXT:    retl
 ;
 ; X64-CLZ-LABEL: ctlo_i8_undef:
 ; X64-CLZ:       # %bb.0:
-; X64-CLZ-NEXT:    notb %dil
-; X64-CLZ-NEXT:    movzbl %dil, %eax
-; X64-CLZ-NEXT:    shll $24, %eax
-; X64-CLZ-NEXT:    lzcntl %eax, %eax
+; X64-CLZ-NEXT:    shll $24, %edi
+; X64-CLZ-NEXT:    notl %edi
+; X64-CLZ-NEXT:    lzcntl %edi, %eax
 ; X64-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT:    retq
   %tmp1 = xor i8 %x, -1


