[llvm] [IR] Add llvm `clmul` intrinsic (PR #140301)

Mon May 19 21:54:36 PDT 2025

https://github.com/oscardssmith updated https://github.com/llvm/llvm-project/pull/140301

>From c74bfec5b0224309a203d7c9636c8ff2b2c4318a Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Fri, 16 May 2025 12:15:08 -0400
Subject: [PATCH 1/3] add clmul docs

---
 llvm/docs/LangRef.rst                         | 48 +++++++++++++++++++
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |  3 ++
 llvm/include/llvm/IR/Intrinsics.td            |  8 ++++
 llvm/lib/CodeGen/IntrinsicLowering.cpp        | 23 +++++++++
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  6 +++
 .../SelectionDAG/SelectionDAGDumper.cpp       |  1 +
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  2 +
 7 files changed, 91 insertions(+)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 343ca743c74f8..1444654a248e2 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18072,6 +18072,54 @@ Example:
       %r = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11)  ; %r = i8: 225 (0b11100001)
       %r = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8)   ; %r = i8: 255 (0b11111111)
 
+.. clmul:
+
+'``clmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.clmul``
+on any integer bit width or vectors of integers.
+
+::
+
+      declare i16 @llvm.clmul.i16(i16 %a, i16 %b)
+      declare i32 @llvm.clmul.i32(i32 %a, i32 %b)
+      declare i64 @llvm.clmul.i64(i64 %a, i64 %b)
+      declare <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+Overview
+"""""""""
+
+The '``llvm.clmul``' family of intrinsics functions perform carryless multiplication
+(also known as xor multiplication) on the 2 arguments.
+
+Arguments
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+values that will undergo carryless multiplication.
+
+Semantics:
+""""""""""
+
+The ‘llvm.clmul’ intrinsic computes carryless multiply of ``%a`` and ``%b``, which is the result
+of applying the standard multiplication algorithm if you replace all of the aditions with exclusive ors.
+The vector intrinsics, such as llvm.clmul.v4i32, operate on a per-element basis and the element order is not affected.
+
+Examples
+"""""""""
+
+.. code-block:: llvm
+
+      %res = call i4 @llvm.clmul.i4(i4 1, i4 2)  ; %res = 2
+      %res = call i4 @llvm.clmul.i4(i4 5, i4 6)  ; %res = 14
+      %res = call i4 @llvm.clmul.i4(i4 -4, i4 2)  ; %res = -8
+      %res = call i4 @llvm.clmul.i4(i4 -4, i4 -5)  ; %res = -12
+
 Arithmetic with Overflow Intrinsics
 -----------------------------------
 
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 9f66402e4c820..fc3b3b26cbe5e 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -751,6 +751,9 @@ enum NodeType {
   ROTR,
   FSHL,
   FSHR,
+  
+  /// Carryless multiplication operator
+  CLMUL,
 
   /// Byte Swap and Counting operators.
   BSWAP,
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index e1a135a5ad48e..1857829910340 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1431,6 +1431,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in {
       [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
   def int_fshr : DefaultAttrsIntrinsic<[llvm_anyint_ty],
       [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
+  def int_clmul : DefaultAttrsIntrinsic<[llvm_anyint_ty],
+      [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
 }
 
 let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn,
@@ -2103,6 +2105,12 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
                                LLVMMatchType<0>,
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                llvm_i32_ty]>;
+  def int_vp_clmul : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
   def int_vp_sadd_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
                              [ LLVMMatchType<0>,
                                LLVMMatchType<0>,
diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index 1518ead7698be..d66544ee87ea4 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -199,6 +199,25 @@ static Value *LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP) {
   return LowerCTPOP(Context, V, IP);
 }
 
+/// Emit the code to lower clmul of V1, V2 before the specified instruction IP.
+static Value *LowerCLMUL(LLVMContext &Context, Value *V1, Value *V2, Instruction *IP) {
+
+  IRBuilder<> Builder(IP);
+
+  unsigned BitSize = V1->getType()->getPrimitiveSizeInBits();
+  Value *Res = ConstantInt::get(V1->getType(), 0);
+  Value *Zero = ConstantInt::get(V1->getType(), 0);
+  Value *One = ConstantInt::get(V1->getType(), 1);
+  for (unsigned I = 1; I < BitSize; I ++) {
+    Value *LowBit = Builder.CreateAnd(V1, One, "clmul.isodd");
+    Value *Pred = Builder.CreateSelect(LowBit, V2, Zero, "clmul.V2_or_zero");
+    Res = Builder.CreateXor(Res, Pred, "clmul.Res");
+    V1 = Builder.CreateLShr(V1, One, "clmul.V1");
+    V2 = Builder.CreateShl(V2, One, "clmul.V2");
+  }
+  return LowerCTPOP(Context, Res, IP);
+}
+
 static void ReplaceFPIntrinsicWithCall(CallInst *CI, const char *Fname,
                                        const char *Dname,
                                        const char *LDname) {
@@ -262,6 +281,10 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
     CI->replaceAllUsesWith(LowerCTLZ(Context, CI->getArgOperand(0), CI));
     break;
 
+  case Intrinsic::clmul:
+    CI->replaceAllUsesWith(LowerCLMUL(Context, CI->getArgOperand(0), CI->getArgOperand(1), CI));
+    break;
+
   case Intrinsic::cttz: {
     // cttz(x) -> ctpop(~X & (X-1))
     Value *Src = CI->getArgOperand(0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 3ebd3a4b88097..a4e350702ebc8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7188,6 +7188,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     }
     return;
   }
+  case Intrinsic::clmul: {
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    setValue(&I, DAG.getNode(ISD::CLMUL, sdl, Op1.getValueType(), Op1, Op2));
+    return;
+  }
   case Intrinsic::sadd_sat: {
     SDValue Op1 = getValue(I.getArgOperand(0));
     SDValue Op2 = getValue(I.getArgOperand(1));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 803894e298dd5..4fba332f806fd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -298,6 +298,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::ROTR:                       return "rotr";
   case ISD::FSHL:                       return "fshl";
   case ISD::FSHR:                       return "fshr";
+  case ISD::CLMUL:                      return "clmul";
   case ISD::FADD:                       return "fadd";
   case ISD::STRICT_FADD:                return "strict_fadd";
   case ISD::FSUB:                       return "fsub";
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5e761fccc815a..470685b9c68bb 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -10347,6 +10347,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getNode(RISCVISD::MOPRR, DL, XLenVT, Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
   }
+  case Intrinsic::clmul:
   case Intrinsic::riscv_clmul:
     return DAG.getNode(RISCVISD::CLMUL, DL, XLenVT, Op.getOperand(1),
                        Op.getOperand(2));
@@ -14283,6 +14284,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
       return;
     }
+    case Intrinsic::clmul:
     case Intrinsic::riscv_clmul: {
       if (!Subtarget.is64Bit() || N->getValueType(0) != MVT::i32)
         return;

>From 1717aac5530bb180fd180801c0848bfdde5c25a1 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Sun, 18 May 2025 09:13:34 -0400
Subject: [PATCH 2/3] --amend

---
 llvm/docs/LangRef.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 1444654a248e2..fc1daa2f2adbd 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18093,7 +18093,7 @@ on any integer bit width or vectors of integers.
 Overview
 """""""""
 
-The '``llvm.clmul``' family of intrinsics functions perform carryless multiplication
+The '``llvm.clmul``' family of intrinsic functions performs carryless multiplication
 (also known as xor multiplication) on the 2 arguments.
 
 Arguments

>From d9d04fe9a80e59dd4071146933faac0f073d59c3 Mon Sep 17 00:00:00 2001
From: Oscar Smith <oscardssmith at gmail.com>
Date: Tue, 20 May 2025 00:54:25 -0400
Subject: [PATCH 3/3] teach selection dag about clmul legalization

---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  5 +++
 llvm/lib/CodeGen/IntrinsicLowering.cpp        |  2 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  4 +++
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 31 +++++++++++++++++++
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |  3 ++
 llvm/lib/Target/RISCV/RISCVInstrInfoZb.td     |  1 +
 6 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 03099e9ad44dc..fd773e7643dbe 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5375,6 +5375,11 @@ class TargetLowering : public TargetLoweringBase {
   /// \returns The expansion if successful, SDValue() otherwise
   SDValue expandFunnelShift(SDNode *N, SelectionDAG &DAG) const;
 
+  /// Expand carryless multiply.
+  /// \param N Node to expand
+  /// \returns The expansion if successful, SDValue() otherwise
+  SDValue expandCLMUL(SDNode *N, SelectionDAG &DAG) const;
+
   /// Expand rotations.
   /// \param N Node to expand
   /// \param AllowVectorOps expand vector rotate, this should only be performed
diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index d66544ee87ea4..8e6020d1055e9 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -215,7 +215,7 @@ static Value *LowerCLMUL(LLVMContext &Context, Value *V1, Value *V2, Instruction
     V1 = Builder.CreateLShr(V1, One, "clmul.V1");
     V2 = Builder.CreateShl(V2, One, "clmul.V2");
   }
-  return LowerCTPOP(Context, Res, IP);
+  return Res;
 }
 
 static void ReplaceFPIntrinsicWithCall(CallInst *CI, const char *Fname,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 528c07cc5549d..8dee5d1d769b0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3907,6 +3907,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     if (SDValue Expanded = TLI.expandFunnelShift(Node, DAG))
       Results.push_back(Expanded);
     break;
+  case ISD::CLMUL:
+    if (SDValue Expanded = TLI.expandCLMUL(Node, DAG))
+      Results.push_back(Expanded);
+    break;
   case ISD::ROTL:
   case ISD::ROTR:
     if (SDValue Expanded = TLI.expandROT(Node, true /*AllowVectorOps*/, DAG))
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index da999b5057d49..071e560fc8a96 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8131,6 +8131,37 @@ SDValue TargetLowering::expandFunnelShift(SDNode *Node,
   return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
 }
 
+SDValue TargetLowering::expandCLMUL(SDNode *Node,
+                                          SelectionDAG &DAG) const {
+  SDLoc DL(Node);
+  EVT VT = Node->getValueType(0);
+  SDValue V1 = Node->getOperand(0);
+  SDValue V2 = Node->getOperand(1);
+  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+  // Only expand vector types if we have the appropriate vector bit operations.
+  // This includes the operations needed to expand CTPOP if it isn't supported.
+  if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
+                        (!isOperationLegalOrCustom(ISD::SRL, VT) ||
+                        !isOperationLegalOrCustom(ISD::SHL, VT) ||
+                        !isOperationLegalOrCustom(ISD::XOR, VT) ||
+                        !isOperationLegalOrCustom(ISD::AND, VT) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::OR, VT))))
+    return SDValue();
+
+  SDValue Res = DAG.getConstant(0, DL, VT);
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  SDValue One = DAG.getConstant(1, DL, VT);
+  for (unsigned i = 0; i < NumBitsPerElt; ++i) {
+    SDValue LowBit = DAG.getNode(ISD::AND, DL, VT, V1, One);
+    SDValue Pred = DAG.getNode(ISD::SELECT, DL, VT, LowBit, V2, Zero);
+    Res = DAG.getNode(ISD::XOR, DL, VT, Res, Pred);
+    V1 = DAG.getNode(ISD::SRL, DL, VT, V1, One);
+    V2 = DAG.getNode(ISD::SHL, DL, VT, V2, One);
+  }
+  return Res;
+}
+
 // TODO: Merge with expandFunnelShift.
 SDValue TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps,
                                   SelectionDAG &DAG) const {
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c85f0c71ef25f..8df1c863d2d6e 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -781,6 +781,9 @@ void TargetLoweringBase::initActions() {
     // Absolute difference
     setOperationAction({ISD::ABDS, ISD::ABDU}, VT, Expand);
 
+    // Carryless multiply
+    setOperationAction(ISD::CLMUL, VT, Expand);
+
     // Saturated trunc
     setOperationAction(ISD::TRUNCATE_SSAT_S, VT, Expand);
     setOperationAction(ISD::TRUNCATE_SSAT_U, VT, Expand);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 4353e94bdb1d0..fbd227922bd72 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -794,6 +794,7 @@ def : Sh3AddPat<SH3ADD>;
 } // Predicates = [HasStdExtZba, IsRV64]
 
 let Predicates = [HasStdExtZbcOrZbkc] in {
+def : PatGprGpr<clmul, CLMUL>;
 def : PatGprGpr<riscv_clmul, CLMUL>;
 def : PatGprGpr<riscv_clmulh, CLMULH>;
 } // Predicates = [HasStdExtZbcOrZbkc]