[llvm] [IR] Add llvm `clmul` intrinsic (PR #140301)

Thu Jun 19 21:45:00 PDT 2025

https://github.com/oscardssmith updated https://github.com/llvm/llvm-project/pull/140301

>From ce5bf8d87ea02ed81a0f0fb5531978d899dca671 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Fri, 16 May 2025 12:15:08 -0400
Subject: [PATCH 01/17] add clmul docs

---
 llvm/docs/LangRef.rst                         | 48 +++++++++++++++++++
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |  3 ++
 llvm/include/llvm/IR/Intrinsics.td            |  8 ++++
 llvm/lib/CodeGen/IntrinsicLowering.cpp        | 23 +++++++++
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  6 +++
 .../SelectionDAG/SelectionDAGDumper.cpp       |  1 +
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  2 +
 7 files changed, 91 insertions(+)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index cc72a37f68599..a1ee145def5bb 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18097,6 +18097,54 @@ Example:
       %r = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11)  ; %r = i8: 225 (0b11100001)
       %r = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8)   ; %r = i8: 255 (0b11111111)
 
+.. _int_clmul:
+
+'``llvm.clmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.clmul``
+on any integer bit width or vectors of integers.
+
+::
+
+      declare i16 @llvm.clmul.i16(i16 %a, i16 %b)
+      declare i32 @llvm.clmul.i32(i32 %a, i32 %b)
+      declare i64 @llvm.clmul.i64(i64 %a, i64 %b)
+      declare <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+Overview
+"""""""""
+
+The '``llvm.clmul``' family of intrinsics functions perform carryless multiplication
+(also known as xor multiplication) on the 2 arguments.
+
+Arguments
+""""""""""
+
+The arguments (``%a`` and ``%b``) and the result may be integers of any bit
+width or vectors of such integers, but all three must have the same type.
+``%a`` and ``%b`` are the two values that will undergo carryless multiplication.
+
+Semantics:
+""""""""""
+
+The ‘llvm.clmul’ intrinsic computes carryless multiply of ``%a`` and ``%b``, which is the result
+of applying the standard multiplication algorithm if you replace all of the aditions with exclusive ors.
+The vector intrinsics, such as llvm.clmul.v4i32, operate on a per-element basis and the element order is not affected.
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.clmul.i4(i4 1, i4 2)  ; %res = 2
+      %res = call i4 @llvm.clmul.i4(i4 5, i4 6)  ; %res = 14
+      %res = call i4 @llvm.clmul.i4(i4 -4, i4 2)  ; %res = -8
+      %res = call i4 @llvm.clmul.i4(i4 -4, i4 -5)  ; %res = 4
+
 Arithmetic with Overflow Intrinsics
 -----------------------------------
 
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 465e4a0a9d0d8..ffb71593af8bf 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -760,6 +760,9 @@ enum NodeType {
   ROTR,
   FSHL,
   FSHR,
+  
+  /// Carryless multiplication operator
+  CLMUL,
 
   /// Byte Swap and Counting operators.
   BSWAP,
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 7add4a27ce9e9..bc35b67175a7b 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1427,6 +1427,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in {
       [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
   def int_fshr : DefaultAttrsIntrinsic<[llvm_anyint_ty],
       [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
+  def int_clmul : DefaultAttrsIntrinsic<[llvm_anyint_ty],
+      [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
 }
 
 let IntrProperties = [IntrNoMem, IntrSpeculatable,
@@ -2117,6 +2119,12 @@ let IntrProperties = [IntrNoMem] in {
                                LLVMMatchType<0>,
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                llvm_i32_ty]>;
+  def int_vp_clmul : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
   def int_vp_sadd_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
                              [ LLVMMatchType<0>,
                                LLVMMatchType<0>,
diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index 1518ead7698be..d66544ee87ea4 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -199,6 +199,25 @@ static Value *LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP) {
   return LowerCTPOP(Context, V, IP);
 }
 
+/// Emit the code to lower clmul of V1, V2 before the specified instruction IP:
+/// every set bit I of V1 xors (V2 << I) into the returned value.
+static Value *LowerCLMUL(LLVMContext &Context, Value *V1, Value *V2, Instruction *IP) {
+  IRBuilder<> Builder(IP);
+  unsigned BitSize = V1->getType()->getScalarSizeInBits();
+  Value *Res = ConstantInt::get(V1->getType(), 0);
+  Value *Zero = ConstantInt::get(V1->getType(), 0);
+  Value *One = ConstantInt::get(V1->getType(), 1);
+  for (unsigned I = 0; I < BitSize; I++) {
+    Value *LowBit = Builder.CreateAnd(V1, One, "clmul.isodd");
+    Value *IsOdd = Builder.CreateICmpNE(LowBit, Zero); // select needs an i1
+    Value *Pred = Builder.CreateSelect(IsOdd, V2, Zero, "clmul.V2_or_zero");
+    Res = Builder.CreateXor(Res, Pred, "clmul.Res");
+    V1 = Builder.CreateLShr(V1, One, "clmul.V1");
+    V2 = Builder.CreateShl(V2, One, "clmul.V2");
+  }
+  return Res;
+}
+
 static void ReplaceFPIntrinsicWithCall(CallInst *CI, const char *Fname,
                                        const char *Dname,
                                        const char *LDname) {
@@ -262,6 +281,10 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
     CI->replaceAllUsesWith(LowerCTLZ(Context, CI->getArgOperand(0), CI));
     break;
 
+  case Intrinsic::clmul:
+    CI->replaceAllUsesWith(LowerCLMUL(Context, CI->getArgOperand(0), CI->getArgOperand(1), CI));
+    break;
+
   case Intrinsic::cttz: {
     // cttz(x) -> ctpop(~X & (X-1))
     Value *Src = CI->getArgOperand(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 4f548cbad5c30..d2b2167a93559 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7270,6 +7270,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     }
     return;
   }
+  case Intrinsic::clmul: {
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    setValue(&I, DAG.getNode(ISD::CLMUL, sdl, Op1.getValueType(), Op1, Op2));
+    return;
+  }
   case Intrinsic::sadd_sat: {
     SDValue Op1 = getValue(I.getArgOperand(0));
     SDValue Op2 = getValue(I.getArgOperand(1));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 7fc15581c17e4..b8d278e3dd221 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -299,6 +299,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::ROTR:                       return "rotr";
   case ISD::FSHL:                       return "fshl";
   case ISD::FSHR:                       return "fshr";
+  case ISD::CLMUL:                      return "clmul";
   case ISD::FADD:                       return "fadd";
   case ISD::STRICT_FADD:                return "strict_fadd";
   case ISD::FSUB:                       return "fsub";
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 779786fa400fc..d871462b68035 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -10575,6 +10575,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getNode(RISCVISD::MOPRR, DL, XLenVT, Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
   }
+  case Intrinsic::clmul:
   case Intrinsic::riscv_clmul:
     return DAG.getNode(RISCVISD::CLMUL, DL, XLenVT, Op.getOperand(1),
                        Op.getOperand(2));
@@ -14557,6 +14558,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
       return;
     }
+    case Intrinsic::clmul:
     case Intrinsic::riscv_clmul: {
       if (!Subtarget.is64Bit() || N->getValueType(0) != MVT::i32)
         return;

>From 77ba45586e2f7264c8f1cb55ff09fbebbcfaf2ac Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Sun, 18 May 2025 09:13:34 -0400
Subject: [PATCH 02/17] --amend

---
 llvm/docs/LangRef.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index a1ee145def5bb..08eb53da31b46 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18118,7 +18118,7 @@ on any integer bit width or vectors of integers.
 Overview
 """""""""
 
-The '``llvm.clmul``' family of intrinsics functions perform carryless multiplication
+The '``llvm.clmul``' family of intrinsic functions performs carryless multiplication
 (also known as xor multiplication) on the 2 arguments.
 
 Arguments

>From 32a13a757072d15907c1f562bded621ce4b130cd Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Tue, 20 May 2025 00:54:25 -0400
Subject: [PATCH 03/17] teach selection dag about clmul legalization

---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  5 +++
 llvm/lib/CodeGen/IntrinsicLowering.cpp        |  2 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  4 +++
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 31 +++++++++++++++++++
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |  3 ++
 llvm/lib/Target/RISCV/RISCVInstrInfoZb.td     |  1 +
 6 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index dd44afd0855a5..e5ca22713e2cf 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5409,6 +5409,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
   /// \returns The expansion if successful, SDValue() otherwise
   SDValue expandFunnelShift(SDNode *N, SelectionDAG &DAG) const;
 
+  /// Expand carryless multiply.
+  /// \param N Node to expand
+  /// \returns The expansion if successful, SDValue() otherwise
+  SDValue expandCLMUL(SDNode *N, SelectionDAG &DAG) const;
+
   /// Expand rotations.
   /// \param N Node to expand
   /// \param AllowVectorOps expand vector rotate, this should only be performed
diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index d66544ee87ea4..8e6020d1055e9 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -215,7 +215,7 @@ static Value *LowerCLMUL(LLVMContext &Context, Value *V1, Value *V2, Instruction
     V1 = Builder.CreateLShr(V1, One, "clmul.V1");
     V2 = Builder.CreateShl(V2, One, "clmul.V2");
   }
-  return LowerCTPOP(Context, Res, IP);
+  return Res;
 }
 
 static void ReplaceFPIntrinsicWithCall(CallInst *CI, const char *Fname,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index f5f4d71236fee..4c94f68385fd2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3958,6 +3958,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     if (SDValue Expanded = TLI.expandFunnelShift(Node, DAG))
       Results.push_back(Expanded);
     break;
+  case ISD::CLMUL:
+    if (SDValue Expanded = TLI.expandCLMUL(Node, DAG))
+      Results.push_back(Expanded);
+    break;
   case ISD::ROTL:
   case ISD::ROTR:
     if (SDValue Expanded = TLI.expandROT(Node, true /*AllowVectorOps*/, DAG))
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 52f19cc6e1ab0..d6f3dc7f34699 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8147,6 +8147,37 @@ SDValue TargetLowering::expandFunnelShift(SDNode *Node,
   return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
 }
 
+SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
+  SDLoc DL(Node);
+  EVT VT = Node->getValueType(0);
+  SDValue V1 = Node->getOperand(0);
+  SDValue V2 = Node->getOperand(1);
+  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+  // Only expand vector types if we have the appropriate vector bit operations.
+  if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
+                        !isOperationLegalOrCustom(ISD::SRL, VT) ||
+                        !isOperationLegalOrCustom(ISD::SHL, VT) ||
+                        !isOperationLegalOrCustom(ISD::XOR, VT) ||
+                        !isOperationLegalOrCustom(ISD::AND, VT) ||
+                        !isOperationLegalOrCustom(ISD::SUB, VT)))
+    return SDValue();
+
+  // Shift-and-xor multiply: every set bit I of V1 xors (V2 << I) into Res.
+  SDValue Res = DAG.getConstant(0, DL, VT);
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  SDValue One = DAG.getConstant(1, DL, VT);
+  for (unsigned I = 0; I < NumBitsPerElt; ++I) {
+    SDValue LowBit = DAG.getNode(ISD::AND, DL, VT, V1, One);
+    SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, Zero, LowBit); // 0 or ~0
+    SDValue Pred = DAG.getNode(ISD::AND, DL, VT, V2, Mask);
+    Res = DAG.getNode(ISD::XOR, DL, VT, Res, Pred);
+    V1 = DAG.getNode(ISD::SRL, DL, VT, V1, One);
+    V2 = DAG.getNode(ISD::SHL, DL, VT, V2, One);
+  }
+  return Res;
+}
+
 // TODO: Merge with expandFunnelShift.
 SDValue TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps,
                                   SelectionDAG &DAG) const {
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index b1afdc2a3ac39..0613beb70cd1e 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -781,6 +781,9 @@ void TargetLoweringBase::initActions() {
     // Absolute difference
     setOperationAction({ISD::ABDS, ISD::ABDU}, VT, Expand);
 
+    // Carryless multiply
+    setOperationAction(ISD::CLMUL, VT, Expand);
+
     // Saturated trunc
     setOperationAction(ISD::TRUNCATE_SSAT_S, VT, Expand);
     setOperationAction(ISD::TRUNCATE_SSAT_U, VT, Expand);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 4806bcc1d63de..4561f0856b56f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -798,6 +798,7 @@ def : Sh3AddPat<SH3ADD>;
 } // Predicates = [HasStdExtZba, IsRV64]
 
 let Predicates = [HasStdExtZbcOrZbkc] in {
+def : PatGprGpr<clmul, CLMUL>;
 def : PatGprGpr<riscv_clmul, CLMUL>;
 def : PatGprGpr<riscv_clmulh, CLMULH>;
 } // Predicates = [HasStdExtZbcOrZbkc]

>From 6b33f0736e4df72fe7a797311c6ec5a19b4485db Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Tue, 20 May 2025 01:29:35 -0400
Subject: [PATCH 04/17] fix

---
 llvm/lib/CodeGen/IntrinsicLowering.cpp    | 2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfoZb.td | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index 8e6020d1055e9..a8c85469086b8 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -204,7 +204,7 @@ static Value *LowerCLMUL(LLVMContext &Context, Value *V1, Value *V2, Instruction
 
   IRBuilder<> Builder(IP);
 
-  unsigned BitSize = V1->getType()->getPrimitiveSizeInBits();
+  unsigned BitSize = V1->getType()->getScalarSizeInBits();
   Value *Res = ConstantInt::get(V1->getType(), 0);
   Value *Zero = ConstantInt::get(V1->getType(), 0);
   Value *One = ConstantInt::get(V1->getType(), 1);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 4561f0856b56f..7a1c39172cbaa 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -54,6 +54,7 @@ def riscv_unzip   : RVSDNode<"UNZIP",   SDTIntUnaryOp>;
 def riscv_absw    : RVSDNode<"ABSW",    SDTIntUnaryOp>;
 
 // Scalar cryptography
+def clmul         : SDNode<"CLMUL",   SDTIntBinOp>;
 def riscv_clmul   : RVSDNode<"CLMUL",   SDTIntBinOp>;
 def riscv_clmulh  : RVSDNode<"CLMULH",  SDTIntBinOp>;
 def riscv_clmulr  : RVSDNode<"CLMULR",  SDTIntBinOp>;

>From b116f53a768f0d04de30142548837b40098c3ebf Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Tue, 20 May 2025 14:23:34 -0400
Subject: [PATCH 05/17] fixes

---
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 16 ++++++++++++----
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp      |  6 ++++--
 llvm/lib/Target/RISCV/RISCVInstrInfoZb.td        |  2 +-
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index d6f3dc7f34699..3b2005fd4a257 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8148,33 +8148,41 @@ SDValue TargetLowering::expandFunnelShift(SDNode *Node,
 }
 
 SDValue TargetLowering::expandCLMUL(SDNode *Node,
-                                          SelectionDAG &DAG) const {
+                                    SelectionDAG &DAG) const {
   SDLoc DL(Node);
   EVT VT = Node->getValueType(0);
   SDValue V1 = Node->getOperand(0);
   SDValue V2 = Node->getOperand(1);
   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
 
+  EVT SetCCType =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   // Only expand vector types if we have the appropriate vector bit operations.
-  // This includes the operations needed to expand CTPOP if it isn't supported.
   if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
                         (!isOperationLegalOrCustom(ISD::SRL, VT) ||
                         !isOperationLegalOrCustom(ISD::SHL, VT) ||
                         !isOperationLegalOrCustom(ISD::XOR, VT) ||
                         !isOperationLegalOrCustom(ISD::AND, VT) ||
+                        !isOperationLegalOrCustom(ISD::SELECT, VT) ||
                         !isOperationLegalOrCustomOrPromote(ISD::OR, VT))))
     return SDValue();
 
   SDValue Res = DAG.getConstant(0, DL, VT);
   SDValue Zero = DAG.getConstant(0, DL, VT);
   SDValue One = DAG.getConstant(1, DL, VT);
-  for (unsigned i = 0; i < NumBitsPerElt; ++i) {
+  for (unsigned i = 0; i < NumBitsPerElt-1; ++i) {
     SDValue LowBit = DAG.getNode(ISD::AND, DL, VT, V1, One);
-    SDValue Pred = DAG.getNode(ISD::SELECT, DL, VT, LowBit, V2, Zero);
+    SDValue LowBool = DAG.getSetCC(DL, SetCCType, LowBit, One, ISD::SETULT);
+    SDValue Pred = DAG.getNode(ISD::SELECT, DL, VT, LowBool, V2, Zero);
     Res = DAG.getNode(ISD::XOR, DL, VT, Res, Pred);
     V1 = DAG.getNode(ISD::SRL, DL, VT, V1, One);
     V2 = DAG.getNode(ISD::SHL, DL, VT, V2, One);
   }
+  // unroll last iteration to prevent dead nodes
+  SDValue LowBit = DAG.getNode(ISD::AND, DL, VT, V1, One);
+  SDValue LowBool = DAG.getSetCC(DL, SetCCType, LowBit, One, ISD::SETULT);
+  SDValue Pred = DAG.getNode(ISD::SELECT, DL, VT, LowBool, V2, Zero);
+  Res = DAG.getNode(ISD::XOR, DL, VT, Res, Pred);
   return Res;
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d871462b68035..570c713248400 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -400,6 +400,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                        Legal);
   }
 
+  if (Subtarget.hasStdExtZbc() || Subtarget.hasStdExtZbkc()) {
+    setOperationAction(ISD::CLMUL, XLenVT, Legal);
+  }
+
   if (Subtarget.hasStdExtZbb() ||
       (Subtarget.hasVendorXCVbitmanip() && !Subtarget.is64Bit())) {
     if (Subtarget.is64Bit())
@@ -10575,7 +10579,6 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getNode(RISCVISD::MOPRR, DL, XLenVT, Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
   }
-  case Intrinsic::clmul:
   case Intrinsic::riscv_clmul:
     return DAG.getNode(RISCVISD::CLMUL, DL, XLenVT, Op.getOperand(1),
                        Op.getOperand(2));
@@ -14558,7 +14561,6 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
       return;
     }
-    case Intrinsic::clmul:
     case Intrinsic::riscv_clmul: {
       if (!Subtarget.is64Bit() || N->getValueType(0) != MVT::i32)
         return;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 7a1c39172cbaa..577089d8c011e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -54,7 +54,7 @@ def riscv_unzip   : RVSDNode<"UNZIP",   SDTIntUnaryOp>;
 def riscv_absw    : RVSDNode<"ABSW",    SDTIntUnaryOp>;
 
 // Scalar cryptography
-def clmul         : SDNode<"CLMUL",   SDTIntBinOp>;
+def clmul         : RVSDNode<"CLMUL",   SDTIntBinOp>;
 def riscv_clmul   : RVSDNode<"CLMUL",   SDTIntBinOp>;
 def riscv_clmulh  : RVSDNode<"CLMULH",  SDTIntBinOp>;
 def riscv_clmulr  : RVSDNode<"CLMULR",  SDTIntBinOp>;

>From 8469a5990522544be7f224a820c52b37a32bf35c Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Tue, 20 May 2025 14:26:04 -0400
Subject: [PATCH 06/17] remove int_vp_clmul

---
 llvm/include/llvm/IR/Intrinsics.td | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index bc35b67175a7b..dbd31fc7d0cfe 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2119,12 +2119,6 @@ let IntrProperties = [IntrNoMem] in {
                                LLVMMatchType<0>,
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                llvm_i32_ty]>;
-  def int_vp_clmul : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
-                             [ LLVMMatchType<0>,
-                               LLVMMatchType<0>,
-                               LLVMMatchType<0>,
-                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
-                               llvm_i32_ty]>;
   def int_vp_sadd_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
                              [ LLVMMatchType<0>,
                                LLVMMatchType<0>,

>From 3a053d5f93a4f640785b911814afd29939e86fe0 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Tue, 20 May 2025 21:39:09 -0400
Subject: [PATCH 07/17] finish hooking up CLMUL to selectiondag?

---
 llvm/docs/LangRef.rst                         |  2 +-
 llvm/include/llvm/IR/Intrinsics.td            |  2 +-
 llvm/lib/CodeGen/IntrinsicLowering.cpp        |  2 +-
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 36 ++++++++++++++++++-
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |  1 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  3 ++
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 20 +++++------
 7 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 08eb53da31b46..0098d6f58aac3 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18132,7 +18132,7 @@ Semantics:
 """"""""""
 
 The ‘llvm.clmul’ intrinsic computes carryless multiply of ``%a`` and ``%b``, which is the result
-of applying the standard multiplication algorithm if you replace all of the aditions with exclusive ors.
+of applying the standard multiplication algorithm if you replace all of the additions with exclusive ors.
 The vector intrinsics, such as llvm.clmul.v4i32, operate on a per-element basis and the element order is not affected.
 
 Examples
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index dbd31fc7d0cfe..e92fd71ea9681 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1428,7 +1428,7 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in {
   def int_fshr : DefaultAttrsIntrinsic<[llvm_anyint_ty],
       [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
   def int_clmul : DefaultAttrsIntrinsic<[llvm_anyint_ty],
-      [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
+      [LLVMMatchType<0>, LLVMMatchType<0>]>;
 }
 
 let IntrProperties = [IntrNoMem, IntrSpeculatable,
diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index a8c85469086b8..9111790e0193b 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -208,7 +208,7 @@ static Value *LowerCLMUL(LLVMContext &Context, Value *V1, Value *V2, Instruction
   Value *Res = ConstantInt::get(V1->getType(), 0);
   Value *Zero = ConstantInt::get(V1->getType(), 0);
   Value *One = ConstantInt::get(V1->getType(), 1);
-  for (unsigned I = 1; I < BitSize; I ++) {
+  for (unsigned I = 1; I < BitSize; I++) {
     Value *LowBit = Builder.CreateAnd(V1, One, "clmul.isodd");
     Value *Pred = Builder.CreateSelect(LowBit, V2, Zero, "clmul.V2_or_zero");
     Res = Builder.CreateXor(Res, Pred, "clmul.Res");
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index dd64676222055..7057d17c603e6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -208,7 +208,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::VP_XOR:
   case ISD::VP_ADD:
   case ISD::VP_SUB:
-  case ISD::VP_MUL:      Res = PromoteIntRes_SimpleIntBinOp(N); break;
+  case ISD::VP_MUL:
+  case ISD::CLMUL:      Res = PromoteIntRes_SimpleIntBinOp(N); break;
 
   case ISD::ABDS:
   case ISD::AVGCEILS:
@@ -3111,6 +3112,10 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
     ExpandIntRes_FunnelShift(N, Lo, Hi);
     break;
 
+  case ISD::CLMUL:
+    ExpandIntRes_CLMUL(N, Lo, Hi);
+    break;
+
   case ISD::VSCALE:
     ExpandIntRes_VSCALE(N, Lo, Hi);
     break;
@@ -5438,6 +5443,35 @@ void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N, SDValue &Lo,
   Hi = DAG.getNode(Opc, DL, HalfVT, Select3, Select2, NewShAmt);
 }
 
+void DAGTypeLegalizer::ExpandIntRes_CLMUL(SDNode *N, SDValue &Lo,
+                                          SDValue &Hi) {
+  // In1/In2: low/high half of operand 1; In3/In4: low/high half of operand 0.
+  SDValue In1, In2, In3, In4;
+  GetExpandedInteger(N->getOperand(0), In3, In4);
+  GetExpandedInteger(N->getOperand(1), In1, In2);
+  EVT HalfVT = In1.getValueType();
+  SDLoc DL(N);
+
+  // CLMUL is carryless, so Lo depends only on the two low halves.
+  Lo = DAG.getNode(ISD::CLMUL, DL, HalfVT, In1, In3);
+  // The bits of the full In1 x In3 product that overflow HalfVT equal
+  // BITREVERSE(CLMUL(BITREVERSE(In1), BITREVERSE(In3))) >> 1; the shift
+  // drops the bit that belongs to Lo. Hi is that overflow xor-ed with the
+  // low halves of the two low/high cross products (the high x high product
+  // lies entirely above Hi).
+  SDValue BitRevIn1 = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, In1);
+  SDValue BitRevIn3 = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, In3);
+  SDValue BitRevLoHi = DAG.getNode(ISD::CLMUL, DL, HalfVT, BitRevIn1, BitRevIn3);
+  SDValue LoHi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, BitRevLoHi);
+  SDValue One = DAG.getConstant(1, DL, HalfVT);
+  Hi = DAG.getNode(ISD::SRL, DL, HalfVT, LoHi, One);
+
+  SDValue HITMP = DAG.getNode(ISD::CLMUL, DL, HalfVT, In1, In4);
+  Hi = DAG.getNode(ISD::XOR, DL, HalfVT, Hi, HITMP);
+  HITMP = DAG.getNode(ISD::CLMUL, DL, HalfVT, In2, In3);
+  Hi = DAG.getNode(ISD::XOR, DL, HalfVT, Hi, HITMP);
+}
+
 void DAGTypeLegalizer::ExpandIntRes_VSCALE(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
   EVT VT = N->getValueType(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a541833684f38..804533954ac3a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -509,6 +509,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
 
   void ExpandIntRes_Rotate            (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_FunnelShift       (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_CLMUL             (SDNode *N, SDValue &Lo, SDValue &Hi);
 
   void ExpandIntRes_VSCALE            (SDNode *N, SDValue &Lo, SDValue &Hi);
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index c56cfec81acdd..ed038afe58fa9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -166,6 +166,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:
+  case ISD::CLMUL:
 
   case ISD::SADDSAT:
   case ISD::UADDSAT:
@@ -1330,6 +1331,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SMAX: case ISD::VP_SMAX:
   case ISD::UMIN: case ISD::VP_UMIN:
   case ISD::UMAX: case ISD::VP_UMAX:
+  case ISD::CLMUL:
   case ISD::SADDSAT: case ISD::VP_SADDSAT:
   case ISD::UADDSAT: case ISD::VP_UADDSAT:
   case ISD::SSUBSAT: case ISD::VP_SSUBSAT:
@@ -4691,6 +4693,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SSUBSAT: case ISD::VP_SSUBSAT:
   case ISD::SSHLSAT:
   case ISD::USHLSAT:
+  case ISD::CLMUL:
   case ISD::ROTL:
   case ISD::ROTR:
   case ISD::AVGFLOORS:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 3b2005fd4a257..1cff21f01891f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8163,26 +8163,22 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node,
                         !isOperationLegalOrCustom(ISD::SHL, VT) ||
                         !isOperationLegalOrCustom(ISD::XOR, VT) ||
                         !isOperationLegalOrCustom(ISD::AND, VT) ||
-                        !isOperationLegalOrCustom(ISD::SELECT, VT) ||
-                        !isOperationLegalOrCustomOrPromote(ISD::OR, VT))))
+                        !isOperationLegalOrCustom(ISD::SELECT, VT))))
     return SDValue();
 
   SDValue Res = DAG.getConstant(0, DL, VT);
   SDValue Zero = DAG.getConstant(0, DL, VT);
   SDValue One = DAG.getConstant(1, DL, VT);
-  for (unsigned i = 0; i < NumBitsPerElt-1; ++i) {
+  for (unsigned I = 0; I < NumBitsPerElt-1; ++I) {
     SDValue LowBit = DAG.getNode(ISD::AND, DL, VT, V1, One);
-    SDValue LowBool = DAG.getSetCC(DL, SetCCType, LowBit, One, ISD::SETULT);
+    SDValue LowBool = DAG.getSetCC(DL, SetCCType, LowBit, Zero, ISD::SETNE);
     SDValue Pred = DAG.getNode(ISD::SELECT, DL, VT, LowBool, V2, Zero);
     Res = DAG.getNode(ISD::XOR, DL, VT, Res, Pred);
-    V1 = DAG.getNode(ISD::SRL, DL, VT, V1, One);
-    V2 = DAG.getNode(ISD::SHL, DL, VT, V2, One);
-  }
-  // unroll last iteration to prevent dead nodes
-  SDValue LowBit = DAG.getNode(ISD::AND, DL, VT, V1, One);
-  SDValue LowBool = DAG.getSetCC(DL, SetCCType, LowBit, One, ISD::SETULT);
-  SDValue Pred = DAG.getNode(ISD::SELECT, DL, VT, LowBool, V2, Zero);
-  Res = DAG.getNode(ISD::XOR, DL, VT, Res, Pred);
+    if (I != NumBitsPerElt) {
+      V1 = DAG.getNode(ISD::SRL, DL, VT, V1, One);
+      V2 = DAG.getNode(ISD::SHL, DL, VT, V2, One);
+    }
+  }
   return Res;
 }
 

>From f08fd4dd4e7ee05db2a17a48ddcc7262fb5e6c5f Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscar.smith at juliacomputing.com>
Date: Wed, 21 May 2025 21:31:17 +0000
Subject: [PATCH 08/17] cleanup

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  3 +--
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 22 +++++++++----------
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  6 ++++-
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  4 ++--
 4 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 4c94f68385fd2..57d7c90b3cdbc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3959,8 +3959,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
       Results.push_back(Expanded);
     break;
   case ISD::CLMUL:
-    if (SDValue Expanded = TLI.expandCLMUL(Node, DAG))
-      Results.push_back(Expanded);
+    Results.push_back(TLI.expandCLMUL(Node, DAG));
     break;
   case ISD::ROTL:
   case ISD::ROTR:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 7057d17c603e6..ad508ee6f9118 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -5446,29 +5446,29 @@ void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N, SDValue &Lo,
 void DAGTypeLegalizer::ExpandIntRes_CLMUL(SDNode *N, SDValue &Lo,
                                                 SDValue &Hi) {
   // Values numbered from least significant to most significant.
-  SDValue In1, In2, In3, In4;
-  GetExpandedInteger(N->getOperand(0), In3, In4);
-  GetExpandedInteger(N->getOperand(1), In1, In2);
-  EVT HalfVT = In1.getValueType();
+  SDValue LL, LH, RL, RH;
+  GetExpandedInteger(N->getOperand(0), LL, LH);
+  GetExpandedInteger(N->getOperand(1), RL, RH);
+  EVT HalfVT = LL.getValueType();
   SDLoc DL(N);
   
   // CLMUL is carryless so Lo is computed from the low half
-  Lo = DAG.getNode(ISD::CLMUL, DL, HalfVT, In1, In3);
+  Lo = DAG.getNode(ISD::CLMUL, DL, HalfVT, LL, RL);
   // the high bits not included in CLMUL(A,B) can be computed by
   // BITREVERSE(CLMUL(BITREVERSE(A), BITREVERSE(B))) >> 1
   // Therefore we can compute the 2 hi/lo cross products
   // and the the overflow of the low product
   // and xor them together to compute HI
-  SDValue BitRevIn1 = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, In1);
-  SDValue BitRevIn3 = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, In3);
-  SDValue BitRevLoHi = DAG.getNode(ISD::CLMUL, DL, HalfVT, BitRevIn1, BitRevIn3);
+  SDValue BitRevLL = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, LL);
+  SDValue BitRevRL = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, RL);
+  SDValue BitRevLoHi = DAG.getNode(ISD::CLMUL, DL, HalfVT, BitRevLL, BitRevRL);
   SDValue LoHi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, BitRevLoHi);
-  SDValue One = DAG.getConstant(0, DL, HalfVT);
+  SDValue One = DAG.getShiftAmountConstant(1, HalfVT, DL);
   Hi = DAG.getNode(ISD::SRL, DL, HalfVT, LoHi, One);
   
-  SDValue HITMP = DAG.getNode(ISD::CLMUL, DL, HalfVT, In1, In4);
+  SDValue HITMP = DAG.getNode(ISD::CLMUL, DL, HalfVT, LL, RH);
   Hi = DAG.getNode(ISD::XOR, DL, HalfVT, Hi, HITMP);
-  HITMP = DAG.getNode(ISD::CLMUL, DL, HalfVT, In2, In3);
+  HITMP = DAG.getNode(ISD::CLMUL, DL, HalfVT, LH, RL);
   Hi = DAG.getNode(ISD::XOR, DL, HalfVT, Hi, HITMP);
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index d2b2167a93559..67929ccb72334 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7273,7 +7273,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::clmul: {
     SDValue Op1 = getValue(I.getArgOperand(0));
     SDValue Op2 = getValue(I.getArgOperand(1));
-    setValue(&I, DAG.getNode(ISD::CLMUL, sdl, Op1.getValueType(), Op1, Op2));
+    EVT VT =  Op1.getValueType();
+    assert(VT.isInteger() && "This operator does not apply to FP types!");
+    assert(Op1.getValueType() == Op2.getValueType() &&
+           Op1.getValueType() == VT && "Binary operator types must match!");
+    setValue(&I, DAG.getNode(ISD::CLMUL, sdl, VT, Op1, Op2));
     return;
   }
   case Intrinsic::sadd_sat: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1cff21f01891f..5e9d99b4d6b0f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8169,12 +8169,12 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node,
   SDValue Res = DAG.getConstant(0, DL, VT);
   SDValue Zero = DAG.getConstant(0, DL, VT);
   SDValue One = DAG.getConstant(1, DL, VT);
-  for (unsigned I = 0; I < NumBitsPerElt-1; ++I) {
+  for (unsigned I = 0; I < NumBitsPerElt; ++I) {
     SDValue LowBit = DAG.getNode(ISD::AND, DL, VT, V1, One);
     SDValue LowBool = DAG.getSetCC(DL, SetCCType, LowBit, Zero, ISD::SETNE);
     SDValue Pred = DAG.getNode(ISD::SELECT, DL, VT, LowBool, V2, Zero);
     Res = DAG.getNode(ISD::XOR, DL, VT, Res, Pred);
-    if (I != NumBitsPerElt) {
+    if (I != NumBitsPerElt-1) {
       V1 = DAG.getNode(ISD::SRL, DL, VT, V1, One);
       V2 = DAG.getNode(ISD::SHL, DL, VT, V2, One);
     }

>From 15bc07c5c1643bde04f575dc85fdeaf499403cfd Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscar.smith at juliacomputing.com>
Date: Wed, 21 May 2025 21:38:19 +0000
Subject: [PATCH 09/17] scalarize vector clmul on error

---
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5e9d99b4d6b0f..c3c2097dcac38 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8158,13 +8158,15 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node,
   EVT SetCCType =
       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   // Only expand vector types if we have the appropriate vector bit operations.
+  // FIXME: Should really try to split the vector in case it's legal on a
+  // subvector.
   if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
                         (!isOperationLegalOrCustom(ISD::SRL, VT) ||
                         !isOperationLegalOrCustom(ISD::SHL, VT) ||
                         !isOperationLegalOrCustom(ISD::XOR, VT) ||
                         !isOperationLegalOrCustom(ISD::AND, VT) ||
                         !isOperationLegalOrCustom(ISD::SELECT, VT))))
-    return SDValue();
+    return DAG.UnrollVectorOp(Node);
 
   SDValue Res = DAG.getConstant(0, DL, VT);
   SDValue Zero = DAG.getConstant(0, DL, VT);

>From be25684934a4ea941cd7fddbe959ac0bcb747541 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Wed, 21 May 2025 19:57:37 -0400
Subject: [PATCH 10/17] address review

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 1 +
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp         | 1 +
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp  | 6 +-----
 llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp       | 7 ++++---
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index ad508ee6f9118..be9a35e6038d8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -5459,6 +5459,7 @@ void DAGTypeLegalizer::ExpandIntRes_CLMUL(SDNode *N, SDValue &Lo,
   // Therefore we can compute the 2 hi/lo cross products
   // and the the overflow of the low product
   // and xor them together to compute HI
+  // TODO: Use a widening CLMUL or a CLMULH here if the target supports one.
   SDValue BitRevLL = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, LL);
   SDValue BitRevRL = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, RL);
   SDValue BitRevLoHi = DAG.getNode(ISD::CLMUL, DL, HalfVT, BitRevLL, BitRevRL);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 45a37622a531b..86feaaa94aad1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7400,6 +7400,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
   case ISD::SSUBSAT:
   case ISD::UADDSAT:
   case ISD::USUBSAT:
+  case ISD::CLMUL:
     assert(VT.isInteger() && "This operator does not apply to FP types!");
     assert(N1.getValueType() == N2.getValueType() &&
            N1.getValueType() == VT && "Binary operator types must match!");
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 67929ccb72334..d2b2167a93559 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7273,11 +7273,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::clmul: {
     SDValue Op1 = getValue(I.getArgOperand(0));
     SDValue Op2 = getValue(I.getArgOperand(1));
-    EVT VT =  Op1.getValueType();
-    assert(VT.isInteger() && "This operator does not apply to FP types!");
-    assert(Op1.getValueType() == Op2.getValueType() &&
-           Op1.getValueType() == VT && "Binary operator types must match!");
-    setValue(&I, DAG.getNode(ISD::CLMUL, sdl, VT, Op1, Op2));
+    setValue(&I, DAG.getNode(ISD::CLMUL, sdl, Op1.getValueType(), Op1, Op2));
     return;
   }
   case Intrinsic::sadd_sat: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c3c2097dcac38..7babcfdf24ae4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8171,14 +8171,15 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node,
   SDValue Res = DAG.getConstant(0, DL, VT);
   SDValue Zero = DAG.getConstant(0, DL, VT);
   SDValue One = DAG.getConstant(1, DL, VT);
+  SDValue OneForShift = DAG.getShiftAmountConstant(1, VT, DL);
   for (unsigned I = 0; I < NumBitsPerElt; ++I) {
     SDValue LowBit = DAG.getNode(ISD::AND, DL, VT, V1, One);
     SDValue LowBool = DAG.getSetCC(DL, SetCCType, LowBit, Zero, ISD::SETNE);
     SDValue Pred = DAG.getNode(ISD::SELECT, DL, VT, LowBool, V2, Zero);
     Res = DAG.getNode(ISD::XOR, DL, VT, Res, Pred);
-    if (I != NumBitsPerElt-1) {
-      V1 = DAG.getNode(ISD::SRL, DL, VT, V1, One);
-      V2 = DAG.getNode(ISD::SHL, DL, VT, V2, One);
+    if (I != NumBitsPerElt - 1) {
+      V1 = DAG.getNode(ISD::SRL, DL, VT, V1, OneForShift);
+      V2 = DAG.getNode(ISD::SHL, DL, VT, V2, OneForShift);
     }
   }
   return Res;

>From 54bc719a6b0fe14d58a71088cce061b142c6a7d0 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Sat, 31 May 2025 15:49:15 -0400
Subject: [PATCH 11/17] format

---
 .../CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index be9a35e6038d8..1fb76d253749d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -5444,7 +5444,7 @@ void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N, SDValue &Lo,
 }
 
 void DAGTypeLegalizer::ExpandIntRes_CLMUL(SDNode *N, SDValue &Lo,
-                                                SDValue &Hi) {
+                                          SDValue &Hi) {
   // Values numbered from least significant to most significant.
   SDValue LL, LH, RL, RH;
   GetExpandedInteger(N->getOperand(0), LL, LH);
@@ -5452,9 +5452,10 @@ void DAGTypeLegalizer::ExpandIntRes_CLMUL(SDNode *N, SDValue &Lo,
   EVT HalfVT = LL.getValueType();
   SDLoc DL(N);
   
-  // CLMUL is carryless so Lo is computed from the low half
+  // Lo is computed from the low half
   Lo = DAG.getNode(ISD::CLMUL, DL, HalfVT, LL, RL);
-  // the high bits not included in CLMUL(A,B) can be computed by
+  // CLMUL is carryless so the high bits not included in CLMUL(A,B)
+  // can be computed by
   // BITREVERSE(CLMUL(BITREVERSE(A), BITREVERSE(B))) >> 1
   // Therefore we can compute the 2 hi/lo cross products
   // and the the overflow of the low product
@@ -5467,10 +5468,10 @@ void DAGTypeLegalizer::ExpandIntRes_CLMUL(SDNode *N, SDValue &Lo,
   SDValue One = DAG.getShiftAmountConstant(1, HalfVT, DL);
   Hi = DAG.getNode(ISD::SRL, DL, HalfVT, LoHi, One);
   
-  SDValue HITMP = DAG.getNode(ISD::CLMUL, DL, HalfVT, LL, RH);
-  Hi = DAG.getNode(ISD::XOR, DL, HalfVT, Hi, HITMP);
-  HITMP = DAG.getNode(ISD::CLMUL, DL, HalfVT, LH, RL);
-  Hi = DAG.getNode(ISD::XOR, DL, HalfVT, Hi, HITMP);
+  SDValue HiTmp = DAG.getNode(ISD::CLMUL, DL, HalfVT, LL, RH);
+  Hi = DAG.getNode(ISD::XOR, DL, HalfVT, Hi, HiTmp);
+  HiTmp = DAG.getNode(ISD::CLMUL, DL, HalfVT, LH, RL);
+  Hi = DAG.getNode(ISD::XOR, DL, HalfVT, Hi, HiTmp);
 }
 
 void DAGTypeLegalizer::ExpandIntRes_VSCALE(SDNode *N, SDValue &Lo,

>From 24021df66b5994f9647b38e62c986df468642fa5 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Sat, 31 May 2025 17:00:33 -0400
Subject: [PATCH 12/17] dont support fastisel

---
 llvm/lib/CodeGen/IntrinsicLowering.cpp | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index 9111790e0193b..1518ead7698be 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -199,25 +199,6 @@ static Value *LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP) {
   return LowerCTPOP(Context, V, IP);
 }
 
-/// Emit the code to lower clmul of V1, V2 before the specified instruction IP.
-static Value *LowerCLMUL(LLVMContext &Context, Value *V1, Value *V2, Instruction *IP) {
-
-  IRBuilder<> Builder(IP);
-
-  unsigned BitSize = V1->getType()->getScalarSizeInBits();
-  Value *Res = ConstantInt::get(V1->getType(), 0);
-  Value *Zero = ConstantInt::get(V1->getType(), 0);
-  Value *One = ConstantInt::get(V1->getType(), 1);
-  for (unsigned I = 1; I < BitSize; I++) {
-    Value *LowBit = Builder.CreateAnd(V1, One, "clmul.isodd");
-    Value *Pred = Builder.CreateSelect(LowBit, V2, Zero, "clmul.V2_or_zero");
-    Res = Builder.CreateXor(Res, Pred, "clmul.Res");
-    V1 = Builder.CreateLShr(V1, One, "clmul.V1");
-    V2 = Builder.CreateShl(V2, One, "clmul.V2");
-  }
-  return Res;
-}
-
 static void ReplaceFPIntrinsicWithCall(CallInst *CI, const char *Fname,
                                        const char *Dname,
                                        const char *LDname) {
@@ -281,10 +262,6 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
     CI->replaceAllUsesWith(LowerCTLZ(Context, CI->getArgOperand(0), CI));
     break;
 
-  case Intrinsic::clmul:
-    CI->replaceAllUsesWith(LowerCLMUL(Context, CI->getArgOperand(0), CI->getArgOperand(1), CI));
-    break;
-
   case Intrinsic::cttz: {
     // cttz(x) -> ctpop(~X & (X-1))
     Value *Src = CI->getArgOperand(0);

>From 23bb2ebb191affc9ed7013030b34a96b82ae82ec Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Sat, 31 May 2025 17:00:40 -0400
Subject: [PATCH 13/17] add riscv tests

---
 .../CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll   | 11 ++++++++++
 .../CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll   | 22 +++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll
index 51fd086e26dfe..874776370999c 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll
@@ -15,6 +15,17 @@ define i32 @clmul32(i32 %a, i32 %b) nounwind {
   ret i32 %tmp
 }
 
+declare i32 @llvm.clmul.i32(i32 %a, i32 %b)
+
+define i32 @generic_clmul32(i32 %a, i32 %b) nounwind {
+; RV32ZBC-ZBKC-LABEL: clmul32:
+; RV32ZBC-ZBKC:       # %bb.0:
+; RV32ZBC-ZBKC-NEXT:    clmul a0, a0, a1
+; RV32ZBC-ZBKC-NEXT:    ret
+  %tmp = call i32 @llvm.clmul.i32(i32 %a, i32 %b)
+  ret i32 %tmp
+}
+
 declare i32 @llvm.riscv.clmulh.i32(i32 %a, i32 %b)
 
 define i32 @clmul32h(i32 %a, i32 %b) nounwind {
diff --git a/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll
index aa9e89bc20953..4d689de07637f 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll
@@ -15,6 +15,17 @@ define i64 @clmul64(i64 %a, i64 %b) nounwind {
   ret i64 %tmp
 }
 
+declare i64 @llvm.clmul.i64(i64 %a, i64 %b)
+
+define i64 @generic_clmul64(i64 %a, i64 %b) nounwind {
+; RV64ZBC-ZBKC-LABEL: clmul64:
+; RV64ZBC-ZBKC:       # %bb.0:
+; RV64ZBC-ZBKC-NEXT:    clmul a0, a0, a1
+; RV64ZBC-ZBKC-NEXT:    ret
+  %tmp = call i64 @llvm.clmul.i64(i64 %a, i64 %b)
+  ret i64 %tmp
+}
+
 declare i64 @llvm.riscv.clmulh.i64(i64 %a, i64 %b)
 
 define i64 @clmul64h(i64 %a, i64 %b) nounwind {
@@ -37,6 +48,17 @@ define signext i32 @clmul32(i32 signext %a, i32 signext %b) nounwind {
   %tmp = call i32 @llvm.riscv.clmul.i32(i32 %a, i32 %b)
   ret i32 %tmp
 }
+declare i32 @llvm.clmul.i32(i32 %a, i32 %b)
+
+define signext i32 @generic_clmul32(i32 signext %a, i32 signext %b) nounwind {
+; RV64ZBC-ZBKC-LABEL: clmul32:
+; RV64ZBC-ZBKC:       # %bb.0:
+; RV64ZBC-ZBKC-NEXT:    clmul a0, a0, a1
+; RV64ZBC-ZBKC-NEXT:    sext.w a0, a0
+; RV64ZBC-ZBKC-NEXT:    ret
+  %tmp = call i32 @llvm.clmul.i32(i32 %a, i32 %b)
+  ret i32 %tmp
+}
 
 declare i32 @llvm.riscv.clmulh.i32(i32 %a, i32 %b)
 

>From d9ed5b21641d532bc64f311ea6e88a48bb3d8b38 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Sat, 31 May 2025 17:33:36 -0400
Subject: [PATCH 14/17] try fixing test

---
 llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll | 2 +-
 llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll
index 874776370999c..cb190f8ee90c2 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbc-zbkc-intrinsic.ll
@@ -18,7 +18,7 @@ define i32 @clmul32(i32 %a, i32 %b) nounwind {
 declare i32 @llvm.clmul.i32(i32 %a, i32 %b)
 
 define i32 @generic_clmul32(i32 %a, i32 %b) nounwind {
-; RV32ZBC-ZBKC-LABEL: clmul32:
+; RV32ZBC-ZBKC-LABEL: generic_clmul32:
 ; RV32ZBC-ZBKC:       # %bb.0:
 ; RV32ZBC-ZBKC-NEXT:    clmul a0, a0, a1
 ; RV32ZBC-ZBKC-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll
index 4d689de07637f..7a535e93791cb 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbc-zbkc-intrinsic.ll
@@ -18,7 +18,7 @@ define i64 @clmul64(i64 %a, i64 %b) nounwind {
 declare i64 @llvm.clmul.i64(i64 %a, i64 %b)
 
 define i64 @generic_clmul64(i64 %a, i64 %b) nounwind {
-; RV64ZBC-ZBKC-LABEL: clmul64:
+; RV64ZBC-ZBKC-LABEL: generic_clmul64:
 ; RV64ZBC-ZBKC:       # %bb.0:
 ; RV64ZBC-ZBKC-NEXT:    clmul a0, a0, a1
 ; RV64ZBC-ZBKC-NEXT:    ret
@@ -51,7 +51,7 @@ define signext i32 @clmul32(i32 signext %a, i32 signext %b) nounwind {
 declare i32 @llvm.clmul.i32(i32 %a, i32 %b)
 
 define signext i32 @generic_clmul32(i32 signext %a, i32 signext %b) nounwind {
-; RV64ZBC-ZBKC-LABEL: clmul32:
+; RV64ZBC-ZBKC-LABEL: generic_clmul32:
 ; RV64ZBC-ZBKC:       # %bb.0:
 ; RV64ZBC-ZBKC-NEXT:    clmul a0, a0, a1
 ; RV64ZBC-ZBKC-NEXT:    sext.w a0, a0

>From 434f7f0595d43d1b2c4fc656add26b96080189b3 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Mon, 2 Jun 2025 20:33:16 -0400
Subject: [PATCH 15/17] address review

---
 llvm/include/llvm/Target/TargetSelectionDAG.td | 2 ++
 llvm/lib/Target/RISCV/RISCVInstrInfoZb.td      | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 9ac228110eb9c..280f1103c0b29 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -437,6 +437,8 @@ def sra_parts  : SDNode<"ISD::SRA_PARTS" , SDTIntShiftPairOp>;
 def srl_parts  : SDNode<"ISD::SRL_PARTS" , SDTIntShiftPairOp>;
 def fshl       : SDNode<"ISD::FSHL"      , SDTIntShiftDOp>;
 def fshr       : SDNode<"ISD::FSHR"      , SDTIntShiftDOp>;
+def clmul      : SDNode<"ISD::CLMUL"     , SDTIntBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
 def and        : SDNode<"ISD::AND"       , SDTIntBinOp,
                         [SDNPCommutative, SDNPAssociative]>;
 def or         : SDNode<"ISD::OR"        , SDTIntBinOp,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 577089d8c011e..4561f0856b56f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -54,7 +54,6 @@ def riscv_unzip   : RVSDNode<"UNZIP",   SDTIntUnaryOp>;
 def riscv_absw    : RVSDNode<"ABSW",    SDTIntUnaryOp>;
 
 // Scalar cryptography
-def clmul         : RVSDNode<"CLMUL",   SDTIntBinOp>;
 def riscv_clmul   : RVSDNode<"CLMUL",   SDTIntBinOp>;
 def riscv_clmulh  : RVSDNode<"CLMULH",  SDTIntBinOp>;
 def riscv_clmulr  : RVSDNode<"CLMULR",  SDTIntBinOp>;

>From b5f6f512d2dab683561fa012d8947b1175c4fbe0 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Fri, 20 Jun 2025 00:01:27 -0400
Subject: [PATCH 16/17] add clmul to commutative intrinsics

---
 llvm/include/llvm/CodeGen/TargetLowering.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e5ca22713e2cf..01ba2cf45a272 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3002,6 +3002,7 @@ class LLVM_ABI TargetLoweringBase {
     case ISD::AVGCEILU:
     case ISD::ABDS:
     case ISD::ABDU:
+    case ISD::CLMUL:
       return true;
     default: return false;
     }

>From cca2a8d93ce1eb24c28e77bb3c0d6a7fccd2347d Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Fri, 20 Jun 2025 00:44:40 -0400
Subject: [PATCH 17/17] start adding legalization tests

---
 .../CodeGen/RISCV/rv64zbc+zbkb-intrinsic.ll   | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/rv64zbc+zbkb-intrinsic.ll

diff --git a/llvm/test/CodeGen/RISCV/rv64zbc+zbkb-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbc+zbkb-intrinsic.ll
new file mode 100644
index 0000000000000..e55031659ed15
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv64zbc+zbkb-intrinsic.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+zbc,+zbkb -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64ZBC-ZBKC
+; RUN: llc -mtriple=riscv64 -mattr=+zbkc,+zbkb -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64ZBC-ZBKC
+
+declare i128 @llvm.clmul.i128(i128 %a, i128 %b)
+
+; Upper 64 bits of the 128-bit carry-less product of %a and %b, i.e. clmulh.
+; FIXME: Should become a single clmulh, but ISD does not understand clmulh yet.
+define i64 @generic_clmulh64_manual(i64 %a, i64 %b) nounwind {
+; RV64ZBC-ZBKC-LABEL: generic_clmulh64_manual:
+; RV64ZBC-ZBKC:       # %bb.0:
+; RV64ZBC-ZBKC-NEXT:    clmulh a0, a0, a1
+; RV64ZBC-ZBKC-NEXT:    ret
+  %1 = zext i64 %a to i128
+  %2 = zext i64 %b to i128
+  %tmp = call i128 @llvm.clmul.i128(i128 %1, i128 %2)
+  %4 = lshr i128 %tmp, 64
+  %5 = trunc i128 %4 to i64
+  ret i64 %5
+}
\ No newline at end of file