[llvm] [RISCV] Construct constants via instructions if materialization is costly (PR #86926)

via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 28 02:57:34 PDT 2024


llvmbot wrote:



@llvm/pr-subscribers-backend-risc-v

Author: Wang Pengcheng (wangpc-pp)

Changes:

On RISC-V, it can be costly to materialize the constants used in lowering
`ISD::CTPOP`/`ISD::VP_CTPOP`.

We query the materialization cost via `RISCVMatInt::getIntMatCost`, and
if the cost is larger than 2, we instead derive the constant from an
already-materialized one in two instructions (a shift plus a bitwise op).

This fixes #86207.
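
The derivations this relies on are easy to check outside LLVM. Below is a minimal standalone sketch (plain C++, not the LLVM API; the mask names mirror the diff) verifying that each mask follows from `0x0F0F...` with one shift and one bitwise op:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // The four splat masks used by the bit-parallel popcount, for XLEN = 64.
  const uint64_t Mask0F = 0x0F0F0F0F0F0F0F0FULL;
  const uint64_t Mask33 = 0x3333333333333333ULL;
  const uint64_t Mask55 = 0x5555555555555555ULL;
  const uint64_t Mask01 = 0x0101010101010101ULL;

  // Once 0x0F0F... is in a register, each remaining mask costs exactly two
  // instructions (one shift, one xor/and), which is the fallback the patch
  // uses when RISCVMatInt::getIntMatCost reports a cost larger than 2.
  assert(Mask33 == (Mask0F ^ (Mask0F << 2)));
  assert(Mask55 == (Mask33 ^ (Mask33 << 1)));
  assert(Mask01 == (Mask0F & (Mask0F >> 3)));
  return 0;
}
```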


---

Patch is 952.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/86926.diff


21 Files Affected:

- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+195-6) 
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.h (+3) 
- (modified) llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll (+1090-518) 
- (modified) llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll (+63-44) 
- (modified) llvm/test/CodeGen/RISCV/pr56457.ll (+10-14) 
- (modified) llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll (+35-22) 
- (modified) llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll (+52-43) 
- (modified) llvm/test/CodeGen/RISCV/rv64xtheadbb.ll (+110-77) 
- (modified) llvm/test/CodeGen/RISCV/rv64zbb.ll (+211-172) 
- (modified) llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll (+324-340) 
- (modified) llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll (+180-192) 
- (modified) llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll (+1167-779) 
- (modified) llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll (+396-440) 
- (modified) llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll (+1246-888) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll (+2515-1871) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll (+116-164) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll (+1220-839) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll (+70-94) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll (+2605-1961) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll (+172-220) 
- (modified) llvm/test/CodeGen/RISCV/sextw-removal.ll (+29-29) 


``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e6814c5f71a09b..031030990d4405 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -391,7 +391,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
     }
   } else if (!Subtarget.hasVendorXCVbitmanip()) {
-    setOperationAction({ISD::CTTZ, ISD::CTPOP}, XLenVT, Expand);
+    setOperationAction(ISD::CTTZ, XLenVT, Expand);
+    setOperationAction(ISD::CTPOP, XLenVT,
+                       Subtarget.is64Bit() ? Custom : Expand);
     if (RV64LegalI32 && Subtarget.is64Bit())
       setOperationAction({ISD::CTTZ, ISD::CTPOP}, MVT::i32, Expand);
   }
@@ -901,11 +903,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                            VT, Custom);
       } else {
         setOperationAction({ISD::BITREVERSE, ISD::VP_BITREVERSE}, VT, Expand);
-        setOperationAction({ISD::CTLZ, ISD::CTTZ, ISD::CTPOP}, VT, Expand);
+        setOperationAction({ISD::CTLZ, ISD::CTTZ}, VT, Expand);
         setOperationAction({ISD::VP_CTLZ, ISD::VP_CTLZ_ZERO_UNDEF, ISD::VP_CTTZ,
-                            ISD::VP_CTTZ_ZERO_UNDEF, ISD::VP_CTPOP},
+                            ISD::VP_CTTZ_ZERO_UNDEF},
                            VT, Expand);
 
+        setOperationAction({ISD::CTPOP, ISD::VP_CTPOP}, VT, Custom);
+
         // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
         // range of f32.
         EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
@@ -1238,6 +1242,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                               ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTPOP},
                              VT, Custom);
         } else {
+          setOperationAction({ISD::CTPOP, ISD::VP_CTPOP}, VT, Custom);
           // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if element of VT in the
           // range of f32.
           EVT FloatVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
@@ -6746,8 +6751,18 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
   case ISD::UDIV:
   case ISD::UREM:
   case ISD::BSWAP:
-  case ISD::CTPOP:
     return lowerToScalableOp(Op, DAG);
+  case ISD::CTPOP: {
+    if (Op.getValueType().isScalarInteger())
+      return lowerScalarCTPOP(Op, DAG);
+    if (Subtarget.hasStdExtZvbb())
+      return lowerToScalableOp(Op, DAG);
+    return lowerVectorCTPOP(Op, DAG);
+  }
+  case ISD::VP_CTPOP:
+    if (Subtarget.hasStdExtZvbb())
+      return lowerVPOp(Op, DAG);
+    return lowerVectorCTPOP(Op, DAG);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
@@ -6972,8 +6987,6 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     if (Subtarget.hasStdExtZvbb())
       return lowerVPOp(Op, DAG);
     return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG);
-  case ISD::VP_CTPOP:
-    return lowerVPOp(Op, DAG);
   case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
     return lowerVPStridedLoad(Op, DAG);
   case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
@@ -10755,6 +10768,182 @@ SDValue RISCVTargetLowering::lowerABS(SDValue Op, SelectionDAG &DAG) const {
   return Max;
 }
 
+SDValue RISCVTargetLowering::lowerScalarCTPOP(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  MVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout()).getSimpleVT();
+  unsigned Len = VT.getScalarSizeInBits();
+  assert(VT.isInteger() && "lowerScalarCTPOP not implemented for this type.");
+
+  SDValue V = Op.getOperand(0);
+
+  // This is the same algorithm as TargetLowering::expandCTPOP, from
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+  // 0x0F0F0F0F...
+  const APInt &Constant0F = APInt::getSplat(Len, APInt(8, 0x0F));
+  SDValue Mask0F = DAG.getConstant(Constant0F, DL, VT, false, true);
+  // 0x33333333... = (0x0F0F0F0F... ^ (0x0F0F0F0F... << 2))
+  const APInt &Constant33 = APInt::getSplat(Len, APInt(8, 0x33));
+  SDValue Mask33 =
+      RISCVMatInt::getIntMatCost(Constant33, VT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(ISD::XOR, DL, VT, Mask0F,
+                        DAG.getNode(ISD::SHL, DL, VT, Mask0F,
+                                    DAG.getShiftAmountConstant(2, VT, DL)))
+          : DAG.getConstant(Constant33, DL, VT);
+  // 0x55555555... = (0x33333333... ^ (0x33333333... << 1))
+  const APInt &Constant55 = APInt::getSplat(Len, APInt(8, 0x55));
+  SDValue Mask55 =
+      RISCVMatInt::getIntMatCost(Constant55, VT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(ISD::XOR, DL, VT, Mask33,
+                        DAG.getNode(ISD::SHL, DL, VT, Mask33,
+                                    DAG.getShiftAmountConstant(1, VT, DL)))
+          : DAG.getConstant(Constant55, DL, VT);
+
+  // v = v - ((v >> 1) & 0x55555555...)
+  V = DAG.getNode(ISD::SUB, DL, VT, V,
+                  DAG.getNode(ISD::AND, DL, VT,
+                              DAG.getNode(ISD::SRL, DL, VT, V,
+                                          DAG.getConstant(1, DL, ShVT)),
+                              Mask55));
+  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+  V = DAG.getNode(ISD::ADD, DL, VT, DAG.getNode(ISD::AND, DL, VT, V, Mask33),
+                  DAG.getNode(ISD::AND, DL, VT,
+                              DAG.getNode(ISD::SRL, DL, VT, V,
+                                          DAG.getConstant(2, DL, ShVT)),
+                              Mask33));
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  V = DAG.getNode(ISD::AND, DL, VT,
+                  DAG.getNode(ISD::ADD, DL, VT, V,
+                              DAG.getNode(ISD::SRL, DL, VT, V,
+                                          DAG.getConstant(4, DL, ShVT))),
+                  Mask0F);
+
+  // v = (v * 0x01010101...) >> (Len - 8)
+  // 0x01010101... == (0x0F0F0F0F... & (0x0F0F0F0F... >> 3))
+  const APInt &Constant01 = APInt::getSplat(Len, APInt(8, 0x01));
+  SDValue Mask01 =
+      RISCVMatInt::getIntMatCost(Constant01, VT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(ISD::AND, DL, VT, Mask0F,
+                        DAG.getNode(ISD::SRL, DL, VT, Mask0F,
+                                    DAG.getShiftAmountConstant(3, VT, DL)))
+          : DAG.getConstant(Constant01, DL, VT);
+  return DAG.getNode(ISD::SRL, DL, VT, DAG.getNode(ISD::MUL, DL, VT, V, Mask01),
+                     DAG.getConstant(Len - 8, DL, ShVT));
+}
+
+SDValue RISCVTargetLowering::lowerVectorCTPOP(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  unsigned Len = VT.getScalarSizeInBits();
+  assert(VT.isInteger() && "lowerVectorCTPOP not implemented for this type.");
+
+  SDValue V = Op.getOperand(0);
+  MVT ContainerVT = VT;
+  if (VT.isFixedLengthVector()) {
+    ContainerVT = getContainerForFixedLengthVector(VT);
+    V = convertToScalableVector(ContainerVT, V, DAG, Subtarget);
+  }
+
+  SDValue Mask, VL;
+  if (Op->getOpcode() == ISD::VP_CTPOP) {
+    Mask = Op->getOperand(1);
+    if (VT.isFixedLengthVector())
+      Mask = convertToScalableVector(getMaskTypeFor(ContainerVT), Mask, DAG,
+                                     Subtarget);
+    VL = Op->getOperand(2);
+  } else
+    std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget);
+
+  // This is the same algorithm as TargetLowering::expandVPCTPOP, from
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+
+  // 0x0F0F0F0F...
+  const APInt &Constant0F = APInt::getSplat(Len, APInt(8, 0x0F));
+  SDValue Mask0F = DAG.getConstant(Constant0F, DL, ContainerVT);
+  // 0x33333333... = (0x0F0F0F0F... ^ (0x0F0F0F0F... << 2))
+  const APInt &Constant33 = APInt::getSplat(Len, APInt(8, 0x33));
+  SDValue Mask33 =
+      RISCVMatInt::getIntMatCost(Constant33, ContainerVT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(RISCVISD::XOR_VL, DL, ContainerVT, Mask0F,
+                        DAG.getNode(RISCVISD::SHL_VL, DL, ContainerVT, Mask0F,
+                                    DAG.getConstant(2, DL, ContainerVT),
+                                    DAG.getUNDEF(ContainerVT), Mask, VL),
+                        DAG.getUNDEF(ContainerVT), Mask, VL)
+          : DAG.getConstant(Constant33, DL, ContainerVT);
+  // 0x55555555... = (0x33333333... ^ (0x33333333... << 1))
+  const APInt &Constant55 = APInt::getSplat(Len, APInt(8, 0x55));
+  SDValue Mask55 =
+      RISCVMatInt::getIntMatCost(Constant55, ContainerVT.getScalarSizeInBits(),
+                                 Subtarget) > 2
+          ? DAG.getNode(RISCVISD::XOR_VL, DL, ContainerVT, Mask33,
+                        DAG.getNode(RISCVISD::SHL_VL, DL, ContainerVT, Mask33,
+                                    DAG.getConstant(1, DL, ContainerVT),
+                                    DAG.getUNDEF(ContainerVT), Mask, VL),
+                        DAG.getUNDEF(ContainerVT), Mask, VL)
+          : DAG.getConstant(Constant55, DL, ContainerVT);
+
+  SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5;
+
+  // v = v - ((v >> 1) & 0x55555555...)
+  Tmp1 = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT,
+                     DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, V,
+                                 DAG.getConstant(1, DL, ContainerVT),
+                                 DAG.getUNDEF(ContainerVT), Mask, VL),
+                     Mask55, DAG.getUNDEF(ContainerVT), Mask, VL);
+  V = DAG.getNode(RISCVISD::SUB_VL, DL, ContainerVT, V, Tmp1,
+                  DAG.getUNDEF(ContainerVT), Mask, VL);
+
+  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+  Tmp2 = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, V, Mask33,
+                     DAG.getUNDEF(ContainerVT), Mask, VL);
+  Tmp3 = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT,
+                     DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, V,
+                                 DAG.getConstant(2, DL, ContainerVT),
+                                 DAG.getUNDEF(ContainerVT), Mask, VL),
+                     Mask33, DAG.getUNDEF(ContainerVT), Mask, VL);
+  V = DAG.getNode(RISCVISD::ADD_VL, DL, ContainerVT, Tmp2, Tmp3,
+                  DAG.getUNDEF(ContainerVT), Mask, VL);
+
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  Tmp4 = DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, V,
+                     DAG.getConstant(4, DL, ContainerVT),
+                     DAG.getUNDEF(ContainerVT), Mask, VL);
+  Tmp5 = DAG.getNode(RISCVISD::ADD_VL, DL, ContainerVT, V, Tmp4,
+                     DAG.getUNDEF(ContainerVT), Mask, VL);
+  V = DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, Tmp5, Mask0F,
+                  DAG.getUNDEF(ContainerVT), Mask, VL);
+
+  if (Len > 8) {
+    // v = (v * 0x01010101...) >> (Len - 8)
+    // 0x01010101... == (0x0F0F0F0F... & (0x0F0F0F0F... >> 3))
+    const APInt &Constant01 = APInt::getSplat(Len, APInt(8, 0x01));
+    SDValue Mask01 =
+        RISCVMatInt::getIntMatCost(
+            Constant01, ContainerVT.getScalarSizeInBits(), Subtarget) > 2
+            ? DAG.getNode(RISCVISD::AND_VL, DL, ContainerVT, Mask0F,
+                          DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT, Mask0F,
+                                      DAG.getConstant(3, DL, ContainerVT),
+                                      DAG.getUNDEF(ContainerVT), Mask, VL),
+                          DAG.getUNDEF(ContainerVT), Mask, VL)
+            : DAG.getConstant(Constant01, DL, ContainerVT);
+    V = DAG.getNode(RISCVISD::SRL_VL, DL, ContainerVT,
+                    DAG.getNode(RISCVISD::MUL_VL, DL, ContainerVT, V, Mask01,
+                                DAG.getUNDEF(ContainerVT), Mask, VL),
+                    DAG.getConstant(Len - 8, DL, ContainerVT),
+                    DAG.getUNDEF(ContainerVT), Mask, VL);
+  }
+
+  if (VT.isFixedLengthVector())
+    V = convertFromScalableVector(VT, V, DAG, Subtarget);
+  return V;
+}
+
 SDValue RISCVTargetLowering::lowerFixedLengthVectorFCOPYSIGNToRVV(
     SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index c11b1464757c7f..cc8a18d9088106 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -959,6 +959,9 @@ class RISCVTargetLowering : public TargetLowering {
   SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue lowerScalarCTPOP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVectorCTPOP(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const;
 
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 455e6e54c9b396..1eaf91096336f3 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -53,28 +53,77 @@ define i8 @test_cttz_i8(i8 %a) nounwind {
 ; RV32_NOZBB-NEXT:    li a0, 8
 ; RV32_NOZBB-NEXT:    ret
 ;
-; RV64NOZBB-LABEL: test_cttz_i8:
-; RV64NOZBB:       # %bb.0:
-; RV64NOZBB-NEXT:    andi a1, a0, 255
-; RV64NOZBB-NEXT:    beqz a1, .LBB0_2
-; RV64NOZBB-NEXT:  # %bb.1: # %cond.false
-; RV64NOZBB-NEXT:    addi a1, a0, -1
-; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    andi a1, a1, 85
-; RV64NOZBB-NEXT:    subw a0, a0, a1
-; RV64NOZBB-NEXT:    andi a1, a0, 51
-; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    andi a0, a0, 51
-; RV64NOZBB-NEXT:    add a0, a1, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 4
-; RV64NOZBB-NEXT:    add a0, a0, a1
-; RV64NOZBB-NEXT:    andi a0, a0, 15
-; RV64NOZBB-NEXT:    ret
-; RV64NOZBB-NEXT:  .LBB0_2:
-; RV64NOZBB-NEXT:    li a0, 8
-; RV64NOZBB-NEXT:    ret
+; RV64I-LABEL: test_cttz_i8:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    andi a1, a0, 255
+; RV64I-NEXT:    beqz a1, .LBB0_2
+; RV64I-NEXT:  # %bb.1: # %cond.false
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    andi a3, a0, 255
+; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    andi a0, a0, 85
+; RV64I-NEXT:    sub a3, a3, a0
+; RV64I-NEXT:    and a0, a3, a2
+; RV64I-NEXT:    srli a3, a3, 2
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+; RV64I-NEXT:  .LBB0_2:
+; RV64I-NEXT:    li a0, 8
+; RV64I-NEXT:    ret
+;
+; RV64M-LABEL: test_cttz_i8:
+; RV64M:       # %bb.0:
+; RV64M-NEXT:    andi a1, a0, 255
+; RV64M-NEXT:    beqz a1, .LBB0_2
+; RV64M-NEXT:  # %bb.1: # %cond.false
+; RV64M-NEXT:    lui a1, 61681
+; RV64M-NEXT:    addiw a1, a1, -241
+; RV64M-NEXT:    slli a2, a1, 32
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    addi a3, a0, -1
+; RV64M-NEXT:    not a0, a0
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    andi a3, a0, 255
+; RV64M-NEXT:    srli a0, a0, 1
+; RV64M-NEXT:    andi a0, a0, 85
+; RV64M-NEXT:    sub a3, a3, a0
+; RV64M-NEXT:    and a0, a3, a2
+; RV64M-NEXT:    srli a3, a3, 2
+; RV64M-NEXT:    and a2, a3, a2
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    and a0, a0, a1
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
+; RV64M-NEXT:    mul a0, a0, a1
+; RV64M-NEXT:    srli a0, a0, 56
+; RV64M-NEXT:    ret
+; RV64M-NEXT:  .LBB0_2:
+; RV64M-NEXT:    li a0, 8
+; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_cttz_i8:
 ; RV32ZBB:       # %bb.0:
@@ -154,35 +203,83 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:    li a0, 16
 ; RV32_NOZBB-NEXT:    ret
 ;
-; RV64NOZBB-LABEL: test_cttz_i16:
-; RV64NOZBB:       # %bb.0:
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    beqz a1, .LBB1_2
-; RV64NOZBB-NEXT:  # %bb.1: # %cond.false
-; RV64NOZBB-NEXT:    addi a1, a0, -1
-; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
-; RV64NOZBB-NEXT:    and a1, a1, a2
-; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
-; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 4
-; RV64NOZBB-NEXT:    add a0, a0, a1
-; RV64NOZBB-NEXT:    andi a1, a0, 15
-; RV64NOZBB-NEXT:    slli a0, a0, 52
-; RV64NOZBB-NEXT:    srli a0, a0, 60
-; RV64NOZBB-NEXT:    add a0, a1, a0
-; RV64NOZBB-NEXT:    ret
-; RV64NOZBB-NEXT:  .LBB1_2:
-; RV64NOZBB-NEXT:    li a0, 16
-; RV64NOZBB-NEXT:    ret
+; RV64I-LABEL: test_cttz_i16:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    slli a1, a0, 48
+; RV64I-NEXT:    beqz a1, .LBB1_2
+; RV64I-NEXT:  # %bb.1: # %cond.false
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lui a1, 61681
+; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    slli a2, a1, 32
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a1, 2
+; RV64I-NEXT:    xor a2, a2, a1
+; RV64I-NEXT:    addi a3, a0, -1
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    lui a4, 5
+; RV64I-NEXT:    addiw a4, a4, 1365
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli a0, a0, 48
+; RV64I-NEXT:    sub a0, a0, a3
+; RV64I-NEXT:    and a3, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a3, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    srli a2, a1, 3
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+; RV64I-NEXT:  .LBB1_2:
+; RV64I-NEXT:    li a0, 16
+; RV64I-NEXT:    ret
+;
+; RV64M-LABEL: test_cttz_i16:
+; RV64M:       # %bb.0:
+; RV64M-NEXT:    slli a1, a0, 48
+; RV64M-NEXT:    beqz a1, .LBB1_2
+; RV64M-NEXT:  # %bb.1: # %cond.false
+; RV64M-NEXT:    lui a1, 61681
+; RV64M-NEXT:    addiw a1, a1, -241
+; RV64M-NEXT:    slli a2, a1, 32
+; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a2, a1, 2
+; RV64M-NEXT:    xor a2, a2, a1
+; RV64M-NEXT:    addi a3, a0, -1
+; RV64M-NEXT:    not a0, a0
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    srli a3, a0, 1
+; RV64M-NEXT:    lui a4, 5
+; RV64M-NEXT:    addiw a4, a4, 1365
+; RV64M-NEXT:    and a3, a3, a4
+; RV64M-NEXT:    slli a0, a0, 48
+; RV64M-NEXT:    srli a0, a0, 48
+; RV64M-NEXT:    sub a0, a0, a3
+; RV64M-NEXT:    and a3, a0, a2
+; RV64M-NEXT:    srli a0, a0, 2
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    add a0, a3, a0
+; RV64M-NEXT:    srli a2, a0, 4
+; RV64M-NEXT:    add a0, a0, a2
+; RV64M-NEXT:    and a0, a0, a1
+; RV64M-NEXT:    srli a2, a1, 3
+; RV64M-NEXT:    and a1, a2, a1
+; RV64M-NEXT:    mul a0, a0, a1
+; RV64M-NEXT:    srli a0, a0, 56
+; RV64M-NEXT:    ret
+; RV64M-NEXT:  .LBB1_2:
+; RV64M-NEXT:    li a0, 16
+; RV64M-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: test_cttz_i16:
 ; RV32ZBB:       # %bb.0:
@@ -422,16 +519,33 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    n...
[truncated]

``````````
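
For readers who don't want to wade through the truncated checks, the expansion the new `lowerScalarCTPOP` emits is the classic bit-parallel popcount. Here is a minimal standalone sketch for XLEN = 64 (plain C++; `popcount64` is an illustrative name, and the `__builtin_popcountll` cross-check assumes GCC/Clang):

```cpp
#include <cassert>
#include <cstdint>
#include <initializer_list>

// Scalar sketch of the lowering for a 64-bit value, following the same
// Stanford bithacks sequence as the patch; the derived masks each cost
// two instructions once 0x0F0F... is materialized.
static unsigned popcount64(uint64_t V) {
  const uint64_t Mask0F = 0x0F0F0F0F0F0F0F0FULL;  // materialized once
  const uint64_t Mask33 = Mask0F ^ (Mask0F << 2); // 0x3333...
  const uint64_t Mask55 = Mask33 ^ (Mask33 << 1); // 0x5555...
  const uint64_t Mask01 = Mask0F & (Mask0F >> 3); // 0x0101...

  V = V - ((V >> 1) & Mask55);            // pairwise bit sums
  V = (V & Mask33) + ((V >> 2) & Mask33); // 4-bit sums
  V = (V + (V >> 4)) & Mask0F;            // 8-bit sums
  return unsigned((V * Mask01) >> 56);    // horizontal add: >> (Len - 8)
}

int main() {
  for (uint64_t X : {uint64_t(0), uint64_t(1), uint64_t(0xFF),
                     uint64_t(0xDEADBEEFCAFEBABE), ~uint64_t(0)})
    assert(popcount64(X) == unsigned(__builtin_popcountll(X)));
  return 0;
}
```

The vector path (`lowerVectorCTPOP`) follows the same sequence with `RISCVISD::*_VL` nodes, and guards the final multiply-and-shift step with `Len > 8` since 8-bit elements need no horizontal add.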



https://github.com/llvm/llvm-project/pull/86926

