[llvm] Optimize count leading ones if promoted type (PR #99591)

Thu Jul 18 21:29:47 PDT 2024

https://github.com/v01dXYZ updated https://github.com/llvm/llvm-project/pull/99591

>From 27075abea73c78ee7e78cd9bfe32e719e7a63d1b Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Tue, 9 Jul 2024 16:42:39 +0200
Subject: [PATCH 1/4] Optimisation for count leading ones when promotion is
 necessary

(CTLZ (XOR Op -1)) --> (CTLZ_ZERO_UNDEF (XOR (SHIFT Op ShiftAmount) -1))
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |  20 +++-
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  36 +++++++
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  45 ++++++++
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |  38 +++++++
 llvm/test/CodeGen/AArch64/ctlo.ll             | 100 ++++++++++++++++++
 llvm/test/CodeGen/X86/ctlo.ll                 |  26 ++---
 6 files changed, 249 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/ctlo.ll

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 22d0708f54786..9a019b42a60ac 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2298,8 +2298,26 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
   if (match(CountZeros->getOperand(1), m_One()))
     return false;
 
-  // If it's cheap to speculate, there's nothing to do.
   Type *Ty = CountZeros->getType();
+  EVT VTy = TLI->getValueType(*DL, Ty);
+
+  // do not despeculate if we have (ctlz (xor -1 op)) if the operand is
+  // promoted as legalisation would later transform to:
+  //
+  // (ctlz (lshift (xor -1 (extend op))
+  //               lshiftamount))
+  //
+  // Despeculation is not only useless but also not wanted with SelectionDAG
+  // as XOR and CTLZ would be in different basic blocks.
+  if (ConstantInt * C;
+      (TLI->getTypeConversion(CountZeros->getContext(), VTy).first ==
+           TargetLowering::TypePromoteInteger ||
+       TLI->getOperationAction(ISD::CTLZ, VTy) == TargetLowering::Promote) &&
+      match(CountZeros->getOperand(0), m_Xor(m_Value(), m_ConstantInt(C))) &&
+      C->isMinusOne())
+    return false;
+
+  // If it's cheap to speculate, there's nothing to do.
   auto IntrinsicID = CountZeros->getIntrinsicID();
   if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz(Ty)) ||
       (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz(Ty)))
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 3f1094e0ac703..a334b14e0cc8b 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2356,6 +2356,35 @@ LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
   return Legalized;
 }
 
+static bool extendCtlzNot(const MachineInstr &MI, MachineIRBuilder &MIRBuilder,
+                          MachineRegisterInfo &MRI, LLT WideTy) {
+  Register XorSrc;
+  Register CstReg;
+  if (!mi_match(MI.getOperand(1).getReg(), MRI,
+                m_GXor(m_Reg(XorSrc), m_Reg(CstReg))))
+    return false;
+
+  auto OptCst = getIConstantVRegValWithLookThrough(CstReg, MRI);
+  APInt Cst = OptCst->Value;
+
+  if (!Cst.isAllOnes())
+    return false;
+
+  auto AllOnes = MIRBuilder.buildConstant(
+      WideTy, APInt::getAllOnes(WideTy.getSizeInBits()));
+  auto Res = MIRBuilder.buildAnyExt(WideTy, XorSrc);
+
+  Register SrcReg = MI.getOperand(1).getReg();
+  LLT CurTy = MRI.getType(SrcReg);
+  unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
+  Res = MIRBuilder.buildShl(WideTy, Res,
+                            MIRBuilder.buildConstant(WideTy, SizeDiff));
+  Res = MIRBuilder.buildXor(WideTy, Res, AllOnes);
+  Res = MIRBuilder.buildCTLZ_ZERO_UNDEF(MI.getOperand(0), Res);
+
+  return true;
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
   switch (MI.getOpcode()) {
@@ -2449,6 +2478,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
     LLT CurTy = MRI.getType(SrcReg);
     unsigned NewOpc = MI.getOpcode();
+
+    if ((MI.getOpcode() == TargetOpcode::G_CTLZ ||
+         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) &&
+        extendCtlzNot(MI, MIRBuilder, MRI, WideTy)) {
+      MI.eraseFromParent();
+      return Legalized;
+    }
     if (NewOpc == TargetOpcode::G_CTTZ) {
       // The count is the same in the larger type except if the original
       // value was zero.  This can be handled by setting the bit just off
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d6a0dd9ae9b20..f062ad2543dd9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -5049,6 +5049,40 @@ static MVT getPromotedVectorElementType(const TargetLowering &TLI,
   return MidVT;
 }
 
+// (CTLZ (XOR Op -1)) --> (TRUNCATE (CTLZ_ZERO_UNDEF
+//                                    (XOR (SHIFT (ANYEXTEND Op1)
+//                                                ShiftAmount)
+//                                         -1)))
+static bool ExtendCtlzNot(SDNode *Node, SDValue &Result, SDLoc &dl, MVT OVT,
+                          MVT NVT, SelectionDAG &DAG) {
+  SDValue NotOp = Node->getOperand(0);
+  if (NotOp.getOpcode() != ISD::XOR)
+    return false;
+
+  SDValue SrcOp = NotOp->getOperand(0);
+  SDValue CstOp = NotOp->getOperand(1);
+
+  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOp);
+
+  if (!Cst || !Cst->isAllOnes())
+    return false;
+
+  auto ExtSrc = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, SrcOp);
+  unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+  auto ShiftConst =
+      DAG.getShiftAmountConstant(SHLAmount, ExtSrc.getValueType(), dl);
+  SDValue NSrcOp = DAG.getNode(ISD::SHL, dl, NVT, ExtSrc, ShiftConst);
+
+  SDValue NCstOp =
+      DAG.getConstant(APInt::getAllOnes(NVT.getScalarSizeInBits()), dl, NVT);
+
+  Result = DAG.getNode(NotOp->getOpcode(), dl, NVT, NSrcOp, NCstOp,
+                       NotOp->getFlags());
+  Result = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Result);
+  Result = DAG.getNode(ISD::TRUNCATE, dl, OVT, Result);
+  return true;
+}
+
 void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
   LLVM_DEBUG(dbgs() << "Trying to promote node\n");
   SmallVector<SDValue, 8> Results;
@@ -5084,6 +5118,13 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
   case ISD::CTTZ_ZERO_UNDEF:
   case ISD::CTLZ:
   case ISD::CTPOP: {
+    // If the operand of CTLZ is NOT, push the extend in the NOT.
+    if (Node->getOpcode() == ISD::CTLZ &&
+        ExtendCtlzNot(Node, Tmp1, dl, OVT, NVT, DAG)) {
+      Results.push_back(Tmp1);
+      break;
+    }
+
     // Zero extend the argument unless its cttz, then use any_extend.
     if (Node->getOpcode() == ISD::CTTZ ||
         Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF)
@@ -5115,6 +5156,10 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     break;
   }
   case ISD::CTLZ_ZERO_UNDEF: {
+    if (ExtendCtlzNot(Node, Tmp1, dl, OVT, NVT, DAG)) {
+      Results.push_back(Tmp1);
+      break;
+    }
     // We know that the argument is unlikely to be zero, hence we can take a
     // different approach as compared to ISD::CTLZ
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index fed5ebcc3c903..58e519c8657e0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -638,6 +638,37 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
   return Result;
 }
 
+// (CTLZ (XOR Op -1)) --> (CTLZ_ZERO_UNDEF (XOR (SHIFT (ANYEXTEND Op1)
+//                                                     ShiftAmount)
+//                                               -1))
+static bool ExtendCtlzNot(SDNode *Node, SDValue &Result, SDLoc &dl, EVT OVT,
+                          EVT NVT, SelectionDAG &DAG) {
+  SDValue NotOp = Node->getOperand(0);
+  if (NotOp.getOpcode() != ISD::XOR)
+    return false;
+
+  SDValue SrcOp = NotOp->getOperand(0);
+  SDValue CstOp = NotOp->getOperand(1);
+
+  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOp);
+  if (!Cst || !Cst->isAllOnes())
+    return false;
+
+  auto ExtSrc = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, SrcOp);
+  unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+  auto ShiftConst =
+      DAG.getShiftAmountConstant(SHLAmount, ExtSrc.getValueType(), dl);
+  SDValue NSrcOp = DAG.getNode(ISD::SHL, dl, NVT, ExtSrc, ShiftConst);
+
+  SDValue NCstOp =
+      DAG.getConstant(APInt::getAllOnes(NVT.getScalarSizeInBits()), dl, NVT);
+
+  Result = DAG.getNode(NotOp->getOpcode(), dl, NVT, NSrcOp, NCstOp,
+                       NotOp->getFlags());
+  Result = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Result);
+  return true;
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
   EVT OVT = N->getValueType(0);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
@@ -656,6 +687,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
   }
 
   unsigned CtlzOpcode = N->getOpcode();
+  // If the operand of CTLZ is NOT, push the extend in the NOT.
+  if (SDValue Res;
+      (CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::CTLZ_ZERO_UNDEF) &&
+      ExtendCtlzNot(N, Res, dl, OVT, NVT, DAG)) {
+    return Res;
+  }
+
   if (CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::VP_CTLZ) {
     // Subtract off the extra leading bits in the bigger type.
     SDValue ExtractLeadingBits = DAG.getConstant(
diff --git a/llvm/test/CodeGen/AArch64/ctlo.ll b/llvm/test/CodeGen/AArch64/ctlo.ll
new file mode 100644
index 0000000000000..5f15f540f458d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ctlo.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mtriple=aarch64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s --mtriple=aarch64 -global-isel -verify-machineinstrs | FileCheck %s
+
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @ctlo_i8(i8 %x) {
+; CHECK-LABEL: ctlo_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    eor w8, w8, w0, lsl #24
+; CHECK-NEXT:    clz w0, w8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i8 %x, -1
+  %tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 false )
+  ret i8 %tmp2
+}
+
+define i8 @ctlo_i8_undef(i8 %x) {
+; CHECK-LABEL: ctlo_i8_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    eor w8, w8, w0, lsl #24
+; CHECK-NEXT:    clz w0, w8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i8 %x, -1
+  %tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 true )
+  ret i8 %tmp2
+}
+
+define i16 @ctlo_i16(i16 %x) {
+; CHECK-LABEL: ctlo_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    eor w8, w8, w0, lsl #16
+; CHECK-NEXT:    clz w0, w8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i16 %x, -1
+  %tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 false )
+  ret i16 %tmp2
+}
+
+define i16 @ctlo_i16_undef(i16 %x) {
+; CHECK-LABEL: ctlo_i16_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
+; CHECK-NEXT:    eor w8, w8, w0, lsl #16
+; CHECK-NEXT:    clz w0, w8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i16 %x, -1
+  %tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 true )
+  ret i16 %tmp2
+}
+
+define i32 @ctlo_i32(i32 %x) {
+; CHECK-LABEL: ctlo_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    clz w0, w8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i32 %x, -1
+  %tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 false )
+  ret i32 %tmp2
+}
+
+define i32 @ctlo_i32_undef(i32 %x) {
+; CHECK-LABEL: ctlo_i32_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn w8, w0
+; CHECK-NEXT:    clz w0, w8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i32 %x, -1
+  %tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 true )
+  ret i32 %tmp2
+}
+
+define i64 @ctlo_i64(i64 %x) {
+; CHECK-LABEL: ctlo_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn x8, x0
+; CHECK-NEXT:    clz x0, x8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i64 %x, -1
+  %tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 false )
+  ret i64 %tmp2
+}
+
+define i64 @ctlo_i64_undef(i64 %x) {
+; CHECK-LABEL: ctlo_i64_undef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn x8, x0
+; CHECK-NEXT:    clz x0, x8
+; CHECK-NEXT:    ret
+  %tmp1 = xor i64 %x, -1
+  %tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 true )
+  ret i64 %tmp2
+}
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index 7431f94f0fdf2..020d6d1b80136 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -46,20 +46,18 @@ define i8 @ctlo_i8(i8 %x) {
 ;
 ; X86-CLZ-LABEL: ctlo_i8:
 ; X86-CLZ:       # %bb.0:
-; X86-CLZ-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT:    notb %al
-; X86-CLZ-NEXT:    movzbl %al, %eax
+; X86-CLZ-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-CLZ-NEXT:    shll $24, %eax
+; X86-CLZ-NEXT:    notl %eax
 ; X86-CLZ-NEXT:    lzcntl %eax, %eax
-; X86-CLZ-NEXT:    addl $-24, %eax
 ; X86-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CLZ-NEXT:    retl
 ;
 ; X64-CLZ-LABEL: ctlo_i8:
 ; X64-CLZ:       # %bb.0:
-; X64-CLZ-NEXT:    notb %dil
-; X64-CLZ-NEXT:    movzbl %dil, %eax
-; X64-CLZ-NEXT:    lzcntl %eax, %eax
-; X64-CLZ-NEXT:    addl $-24, %eax
+; X64-CLZ-NEXT:    shll $24, %edi
+; X64-CLZ-NEXT:    notl %edi
+; X64-CLZ-NEXT:    lzcntl %edi, %eax
 ; X64-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT:    retq
   %tmp1 = xor i8 %x, -1
@@ -89,20 +87,18 @@ define i8 @ctlo_i8_undef(i8 %x) {
 ;
 ; X86-CLZ-LABEL: ctlo_i8_undef:
 ; X86-CLZ:       # %bb.0:
-; X86-CLZ-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT:    notb %al
-; X86-CLZ-NEXT:    movzbl %al, %eax
+; X86-CLZ-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CLZ-NEXT:    shll $24, %eax
+; X86-CLZ-NEXT:    notl %eax
 ; X86-CLZ-NEXT:    lzcntl %eax, %eax
 ; X86-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CLZ-NEXT:    retl
 ;
 ; X64-CLZ-LABEL: ctlo_i8_undef:
 ; X64-CLZ:       # %bb.0:
-; X64-CLZ-NEXT:    notb %dil
-; X64-CLZ-NEXT:    movzbl %dil, %eax
-; X64-CLZ-NEXT:    shll $24, %eax
-; X64-CLZ-NEXT:    lzcntl %eax, %eax
+; X64-CLZ-NEXT:    shll $24, %edi
+; X64-CLZ-NEXT:    notl %edi
+; X64-CLZ-NEXT:    lzcntl %edi, %eax
 ; X64-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT:    retq
   %tmp1 = xor i8 %x, -1

>From c939054f9deac10e4bb5a5a47f21d91572dbf000 Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Thu, 18 Jul 2024 23:32:11 +0200
Subject: [PATCH 2/4] (XFAIL) Make AMDGPU uses SelectDAG Legaliser for CTLZ

LegaliseType does not allow to override promotion for a pair of
operation and type.

the failing tests is CodeGen/AMDGPU/ctlz_zero_undef.ll.
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 25 ++++++++++++++++---
 .../CodeGen/SelectionDAG/LegalizeTypes.cpp    | 11 +++++---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  2 +-
 3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 58e519c8657e0..5a0379a4dce9d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -674,6 +674,17 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
   SDLoc dl(N);
 
+  bool IsDefaultPromotion =
+      TLI.getOperationAction(N->getOpcode(), OVT) != TargetLowering::Promote;
+  if (!IsDefaultPromotion) {
+    EVT NVTPromote = TLI.getTypeToPromoteTo(N->getOpcode(), OVT.getSimpleVT());
+
+    if (NVT == NVTPromote)
+      IsDefaultPromotion = false;
+    else
+      NVT = NVTPromote;
+  }
+
   // If the larger CTLZ isn't supported by the target, try to expand now.
   // If we expand later we'll end up with more operations since we lost the
   // original type.
@@ -701,7 +712,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
 
     if (!N->isVPOpcode()) {
       // Zero extend to the promoted type and do the count there.
-      SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+      SDValue Op = IsDefaultPromotion
+                       ? ZExtPromotedInteger(N->getOperand(0))
+                       : DAG.getZExtOrTrunc(N->getOperand(0), dl, NVT);
+
       return DAG.getNode(ISD::SUB, dl, NVT,
                          DAG.getNode(N->getOpcode(), dl, NVT, Op),
                          ExtractLeadingBits);
@@ -709,7 +723,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
     SDValue Mask = N->getOperand(1);
     SDValue EVL = N->getOperand(2);
     // Zero extend to the promoted type and do the count there.
-    SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL);
+    SDValue Op =
+        IsDefaultPromotion
+            ? VPZExtPromotedInteger(N->getOperand(0), Mask, EVL)
+            : DAG.getVPZExtOrTrunc(dl, NVT, N->getOperand(0), Mask, EVL);
     return DAG.getNode(ISD::VP_SUB, dl, NVT,
                        DAG.getNode(N->getOpcode(), dl, NVT, Op, Mask, EVL),
                        ExtractLeadingBits, Mask, EVL);
@@ -717,7 +734,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
   if (CtlzOpcode == ISD::CTLZ_ZERO_UNDEF ||
       CtlzOpcode == ISD::VP_CTLZ_ZERO_UNDEF) {
     // Any Extend the argument
-    SDValue Op = GetPromotedInteger(N->getOperand(0));
+    SDValue Op = IsDefaultPromotion
+                     ? GetPromotedInteger(N->getOperand(0))
+                     : DAG.getAnyExtOrTrunc(N->getOperand(0), dl, NVT);
     // Op = Op << (sizeinbits(NVT) - sizeinbits(Old VT))
     unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
     auto ShiftConst =
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index cb6d3fe4db8a4..699c45dfa5a4d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -705,9 +705,14 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
 }
 
 void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) {
-  assert(Result.getValueType() ==
-         TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
-         "Invalid type for promoted integer");
+  assert(
+      Result.getValueType() ==
+          ((TLI.getOperationAction(Op.getOpcode(), Op.getValueType()) !=
+            TargetLowering::Promote)
+               ? TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())
+               : TLI.getTypeToPromoteTo(Op.getOpcode(),
+                                        Op.getSimpleValueType())) &&
+      "Invalid type for promoted integer");
   AnalyzeNewValue(Result);
 
   auto &OpIdEntry = PromotedIntegers[getTableId(Op)];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ef30bf6d993fa..5a98329cea8c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -498,7 +498,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
       MVT::i64, Custom);
 
   for (auto VT : {MVT::i8, MVT::i16})
-    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);
+    setOperationPromotedToType({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, MVT::i32);
 
   static const MVT::SimpleValueType VectorIntTypes[] = {
       MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,

>From 04e67386d1fd68870b5cbd424c123ddc75f01387 Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Fri, 19 Jul 2024 02:12:17 +0200
Subject: [PATCH 3/4] Revert "(XFAIL) Make AMDGPU uses SelectDAG Legaliser for
 CTLZ"

This reverts commit 0a4ba146fd84a05489cdd2c91cbc59297bc8a03b.
---
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 25 +++----------------
 .../CodeGen/SelectionDAG/LegalizeTypes.cpp    | 11 +++-----
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  2 +-
 3 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 5a0379a4dce9d..58e519c8657e0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -674,17 +674,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
   SDLoc dl(N);
 
-  bool IsDefaultPromotion =
-      TLI.getOperationAction(N->getOpcode(), OVT) != TargetLowering::Promote;
-  if (!IsDefaultPromotion) {
-    EVT NVTPromote = TLI.getTypeToPromoteTo(N->getOpcode(), OVT.getSimpleVT());
-
-    if (NVT == NVTPromote)
-      IsDefaultPromotion = false;
-    else
-      NVT = NVTPromote;
-  }
-
   // If the larger CTLZ isn't supported by the target, try to expand now.
   // If we expand later we'll end up with more operations since we lost the
   // original type.
@@ -712,10 +701,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
 
     if (!N->isVPOpcode()) {
       // Zero extend to the promoted type and do the count there.
-      SDValue Op = IsDefaultPromotion
-                       ? ZExtPromotedInteger(N->getOperand(0))
-                       : DAG.getZExtOrTrunc(N->getOperand(0), dl, NVT);
-
+      SDValue Op = ZExtPromotedInteger(N->getOperand(0));
       return DAG.getNode(ISD::SUB, dl, NVT,
                          DAG.getNode(N->getOpcode(), dl, NVT, Op),
                          ExtractLeadingBits);
@@ -723,10 +709,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
     SDValue Mask = N->getOperand(1);
     SDValue EVL = N->getOperand(2);
     // Zero extend to the promoted type and do the count there.
-    SDValue Op =
-        IsDefaultPromotion
-            ? VPZExtPromotedInteger(N->getOperand(0), Mask, EVL)
-            : DAG.getVPZExtOrTrunc(dl, NVT, N->getOperand(0), Mask, EVL);
+    SDValue Op = VPZExtPromotedInteger(N->getOperand(0), Mask, EVL);
     return DAG.getNode(ISD::VP_SUB, dl, NVT,
                        DAG.getNode(N->getOpcode(), dl, NVT, Op, Mask, EVL),
                        ExtractLeadingBits, Mask, EVL);
@@ -734,9 +717,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
   if (CtlzOpcode == ISD::CTLZ_ZERO_UNDEF ||
       CtlzOpcode == ISD::VP_CTLZ_ZERO_UNDEF) {
     // Any Extend the argument
-    SDValue Op = IsDefaultPromotion
-                     ? GetPromotedInteger(N->getOperand(0))
-                     : DAG.getAnyExtOrTrunc(N->getOperand(0), dl, NVT);
+    SDValue Op = GetPromotedInteger(N->getOperand(0));
     // Op = Op << (sizeinbits(NVT) - sizeinbits(Old VT))
     unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
     auto ShiftConst =
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 699c45dfa5a4d..cb6d3fe4db8a4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -705,14 +705,9 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
 }
 
 void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) {
-  assert(
-      Result.getValueType() ==
-          ((TLI.getOperationAction(Op.getOpcode(), Op.getValueType()) !=
-            TargetLowering::Promote)
-               ? TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())
-               : TLI.getTypeToPromoteTo(Op.getOpcode(),
-                                        Op.getSimpleValueType())) &&
-      "Invalid type for promoted integer");
+  assert(Result.getValueType() ==
+         TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+         "Invalid type for promoted integer");
   AnalyzeNewValue(Result);
 
   auto &OpIdEntry = PromotedIntegers[getTableId(Op)];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5a98329cea8c9..ef30bf6d993fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -498,7 +498,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
       MVT::i64, Custom);
 
   for (auto VT : {MVT::i8, MVT::i16})
-    setOperationPromotedToType({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, MVT::i32);
+    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);
 
   static const MVT::SimpleValueType VectorIntTypes[] = {
       MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,

>From 7b9d5b77df71ff868ad44e8002aa9d2afa381ebe Mon Sep 17 00:00:00 2001
From: v01dxyz <v01dxyz at v01d.xyz>
Date: Fri, 19 Jul 2024 06:29:14 +0200
Subject: [PATCH 4/4] Update LoongArch ctlz tests

---
 .../test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll | 24 +++++++------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index f17cec231f323..e993ecfcdf3b8 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -89,18 +89,14 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 define i8 @test_not_ctlz_i8(i8 %a) nounwind {
 ; LA32-LABEL: test_not_ctlz_i8:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    ori $a1, $zero, 255
-; LA32-NEXT:    andn $a0, $a1, $a0
-; LA32-NEXT:    clz.w $a0, $a0
-; LA32-NEXT:    addi.w $a0, $a0, -24
+; LA32-NEXT:    slli.w $a0, $a0, 24
+; LA32-NEXT:    clo.w $a0, $a0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: test_not_ctlz_i8:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    ori $a1, $zero, 255
-; LA64-NEXT:    andn $a0, $a1, $a0
-; LA64-NEXT:    clz.d $a0, $a0
-; LA64-NEXT:    addi.d $a0, $a0, -56
+; LA64-NEXT:    slli.d $a0, $a0, 56
+; LA64-NEXT:    clo.d $a0, $a0
 ; LA64-NEXT:    ret
   %neg = xor i8 %a, -1
   %tmp = call i8 @llvm.ctlz.i8(i8 %neg, i1 false)
@@ -110,18 +106,14 @@ define i8 @test_not_ctlz_i8(i8 %a) nounwind {
 define i16 @test_not_ctlz_i16(i16 %a) nounwind {
 ; LA32-LABEL: test_not_ctlz_i16:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    nor $a0, $a0, $zero
-; LA32-NEXT:    bstrpick.w $a0, $a0, 15, 0
-; LA32-NEXT:    clz.w $a0, $a0
-; LA32-NEXT:    addi.w $a0, $a0, -16
+; LA32-NEXT:    slli.w $a0, $a0, 16
+; LA32-NEXT:    clo.w $a0, $a0
 ; LA32-NEXT:    ret
 ;
 ; LA64-LABEL: test_not_ctlz_i16:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    nor $a0, $a0, $zero
-; LA64-NEXT:    bstrpick.d $a0, $a0, 15, 0
-; LA64-NEXT:    clz.d $a0, $a0
-; LA64-NEXT:    addi.d $a0, $a0, -48
+; LA64-NEXT:    slli.d $a0, $a0, 48
+; LA64-NEXT:    clo.d $a0, $a0
 ; LA64-NEXT:    ret
   %neg = xor i16 %a, -1
   %tmp = call i16 @llvm.ctlz.i16(i16 %neg, i1 false)