[llvm] [WebAssembly] Add more lowerings for wide-arithmetic (PR #132430)

Alex Crichton via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 31 07:39:30 PDT 2025


https://github.com/alexcrichton updated https://github.com/llvm/llvm-project/pull/132430

From e42d0bcf257c3ab610898d43fca7bf8f0d5abd74 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex at alexcrichton.com>
Date: Fri, 14 Mar 2025 07:15:52 -0700
Subject: [PATCH] [WebAssembly] Add more lowerings for wide-arithmetic

This commit is the result of investigation and discussion on
WebAssembly/wide-arithmetic#6, where alternatives to the `i64.add128`
instruction were discussed but ultimately deferred to a future proposal.
In spite of that I still wanted to apply a few minor changes to the LLVM
backend here for when `wide-arithmetic` is enabled:

* A lowering for the `ISD::UADDO` node is added which uses `add128` where
  the upper halves of the two operands are constant zeros. The low half of
  the 128-bit result is the sum and the high half is the overflow bit.
* The high half of an `I64_ADD128` node is now flagged as "known zero",
  except for its lowest bit, when the upper halves of the inputs are zero.
  This assists the `UADDO` lowering by ensuring the backend knows that the
  carry result is a 1-bit value.

A few tests were then added to showcase the lowerings of various
operations that can be done with wide-arithmetic. They don't all optimize
particularly well at this time, but I wanted to add them as a reference
here regardless to have them on hand for future evaluations if necessary.
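
As a rough illustration (taken from the `add_wide_u` test added below, with
comments added here for clarity), an overflowing unsigned 64-bit addition
such as:

  %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)

now lowers, with `wide-arithmetic` enabled, to a single `i64.add128` whose
upper operand halves are constant zeros, roughly:

  local.get 1   ;; %a (low half of the first 128-bit operand)
  i64.const 0   ;; high half of the first operand (zero)
  local.get 2   ;; %b (low half of the second 128-bit operand)
  i64.const 0   ;; high half of the second operand (zero)
  i64.add128    ;; pushes the 64-bit sum and the 64-bit carry

(the full test output below additionally stores the two result halves into
the returned aggregate).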
---
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  44 +++++-
 .../WebAssembly/WebAssemblyISelLowering.h     |   1 +
 .../CodeGen/WebAssembly/wide-arithmetic.ll    | 134 ++++++++++++++++++
 3 files changed, 176 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 9ae46e709d823..82d3b8e292e60 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -170,6 +170,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     setOperationAction(ISD::SUB, MVT::i128, Custom);
     setOperationAction(ISD::SMUL_LOHI, MVT::i64, Custom);
     setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);
+    setOperationAction(ISD::UADDO, MVT::i64, Custom);
   }
 
   if (Subtarget->hasNontrappingFPToInt())
@@ -1109,6 +1110,18 @@ void WebAssemblyTargetLowering::computeKnownBitsForTargetNode(
     }
     }
   }
+
+  // For 128-bit addition, if the upper halves of both inputs are zero then
+  // all bits of the high half of the result are known to be zero except for
+  // the lowest one, which holds the carry.
+  case WebAssemblyISD::I64_ADD128:
+    if (Op.getResNo() == 1) {
+      SDValue LHS_HI = Op.getOperand(1);
+      SDValue RHS_HI = Op.getOperand(3);
+      if (isNullConstant(LHS_HI) && isNullConstant(RHS_HI))
+        Known.Zero.setBitsFrom(1);
+    }
+    break;
   }
 }
 
@@ -1678,6 +1691,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
   case ISD::SMUL_LOHI:
   case ISD::UMUL_LOHI:
     return LowerMUL_LOHI(Op, DAG);
+  case ISD::UADDO:
+    return LowerUADDO(Op, DAG);
   }
 }
 
@@ -1794,10 +1809,33 @@ SDValue WebAssemblyTargetLowering::LowerMUL_LOHI(SDValue Op,
   }
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
-  SDValue Hi =
+  SDValue Lo =
       DAG.getNode(Opcode, DL, DAG.getVTList(MVT::i64, MVT::i64), LHS, RHS);
-  SDValue Lo(Hi.getNode(), 1);
-  SDValue Ops[] = {Hi, Lo};
+  SDValue Hi(Lo.getNode(), 1);
+  SDValue Ops[] = {Lo, Hi};
+  return DAG.getMergeValues(Ops, DL);
+}
+
+// Lowers `UADDO` nodes to a single `i64.add128` instruction when
+// wide-arithmetic is enabled.
+//
+// The upper halves of both operands are constant zeros, and the upper half
+// of the result then indicates whether the overflow happened.
+SDValue WebAssemblyTargetLowering::LowerUADDO(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  assert(Subtarget->hasWideArithmetic());
+  assert(Op.getValueType() == MVT::i64);
+  assert(Op.getOpcode() == ISD::UADDO);
+  SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  SDValue Result =
+      DAG.getNode(WebAssemblyISD::I64_ADD128, DL,
+                  DAG.getVTList(MVT::i64, MVT::i64), LHS, Zero, RHS, Zero);
+  SDValue CarryI64(Result.getNode(), 1);
+  SDValue CarryI32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, CarryI64);
+  SDValue Ops[] = {Result, CarryI32};
   return DAG.getMergeValues(Ops, DL);
 }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 90d31e38a7076..72401a7a259c0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -133,6 +133,7 @@ class WebAssemblyTargetLowering final : public TargetLowering {
   SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
   SDValue Replace128Op(SDNode *N, SelectionDAG &DAG) const;
+  SDValue LowerUADDO(SDValue Op, SelectionDAG &DAG) const;
 
   // Custom DAG combine hooks
   SDValue
diff --git a/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll b/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll
index deff551d0eabd..71974b012a2b6 100644
--- a/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll
+++ b/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll
@@ -130,3 +130,137 @@ define i64 @mul_i128_only_lo(i128 %a, i128 %b) {
   %d = trunc i128 %c to i64
   ret i64 %d
 }
+
+declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64)
+declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)
+
+; This is a codegen test to see the effect of overflowing adds on signed
+; integers with wide-arithmetic enabled. At this time it doesn't actually
+; generate anything different from the output without wide-arithmetic, but it
+; has also been useful for evaluating the proposal.
+define { i64, i1 } @add_wide_s(i64 %a, i64 %b) {
+; CHECK-LABEL: add_wide_s:
+; CHECK:         .functype add_wide_s (i32, i64, i64) -> ()
+; CHECK-NEXT:    .local i64
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.add
+; CHECK-NEXT:    local.tee 3
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.lt_s
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.lt_s
+; CHECK-NEXT:    i32.xor
+; CHECK-NEXT:    i32.store8 8
+; CHECK-NEXT:    # fallthrough-return
+  %pair = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
+  ret { i64, i1 } %pair
+}
+
+define { i64, i1 } @add_wide_u(i64 %a, i64 %b) {
+; CHECK-LABEL: add_wide_u:
+; CHECK:         .functype add_wide_u (i32, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.set 1
+; CHECK-NEXT:    local.set 2
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.store8 8
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+  ret { i64, i1 } %pair
+}
+
+; This is a model of a hypothetical `i64.add_wide3_u` instruction using LLVM
+; intrinsics. In theory this should optimize to the same code as the i128-based
+; equivalent below, but it doesn't currently.
+define { i64, i64 } @add_wide3_u_via_intrinsics(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: add_wide3_u_via_intrinsics:
+; CHECK:         .functype add_wide3_u_via_intrinsics (i32, i64, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.set 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.set 1
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.add
+; CHECK-NEXT:    i64.store 8
+; CHECK-NEXT:    # fallthrough-return
+  %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+  %t0 = extractvalue { i64, i1 } %pair, 0
+  %carry1 = extractvalue { i64, i1 } %pair, 1
+
+  %pair2 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %t0, i64 %c)
+  %ret1 = extractvalue { i64, i1 } %pair2, 0
+  %carry2 = extractvalue { i64, i1 } %pair2, 1
+
+  %carry1_64 = zext i1 %carry1 to i64
+  %carry2_64 = zext i1 %carry2 to i64
+  %ret2 = add i64 %carry1_64, %carry2_64
+
+  %r0 = insertvalue { i64, i64 } poison, i64 %ret1, 0
+  %r1 = insertvalue { i64, i64 } %r0, i64 %ret2, 1
+  ret { i64, i64 } %r1
+}
+
+; This is a model of a hypothetical `i64.add_wide3_u` instruction using 128-bit
+; integer addition, which currently optimizes better than the version above.
+define { i64, i64 } @add_wide3_u_via_i128(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: add_wide3_u_via_i128:
+; CHECK:         .functype add_wide3_u_via_i128 (i32, i64, i64, i64) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.get 3
+; CHECK-NEXT:    i64.const 0
+; CHECK-NEXT:    i64.add128
+; CHECK-NEXT:    local.set 1
+; CHECK-NEXT:    local.set 2
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i64.store 8
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %a128 = zext i64 %a to i128
+  %b128 = zext i64 %b to i128
+  %c128 = zext i64 %c to i128
+  %t0 = add i128 %a128, %b128
+  %t1 = add i128 %t0, %c128
+  %result = trunc i128 %t1 to i64
+  %t2 = lshr i128 %t1, 64
+  %carry = trunc i128 %t2 to i64
+
+  %ret0 = insertvalue { i64, i64 } poison, i64 %result, 0
+  %ret1 = insertvalue { i64, i64 } %ret0, i64 %carry, 1
+  ret { i64, i64 } %ret1
+}


