[llvm] [WebAssembly] Add more lowerings for wide-arithmetic (PR #132430)
Alex Crichton via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 31 07:39:30 PDT 2025
https://github.com/alexcrichton updated https://github.com/llvm/llvm-project/pull/132430
>From e42d0bcf257c3ab610898d43fca7bf8f0d5abd74 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex at alexcrichton.com>
Date: Fri, 14 Mar 2025 07:15:52 -0700
Subject: [PATCH] [WebAssembly] Add more lowerings for wide-arithmetic
This commit is the result of investigation and discussion on
WebAssembly/wide-arithmetic#6 where alternatives to the `i64.add128`
instruction were discussed but ultimately deferred to a future proposal.
Despite that deferral I wanted to land a few minor changes to the LLVM
backend here for when `wide-arithmetic` is enabled:
* A lowering for the `ISD::UADDO` node is added which uses `add128`:
  the upper halves of the two operands are constant zeros, and the
  128-bit addition yields both the sum and the overflow bit of the
  original overflowing addition.
* The high bits of an `I64_ADD128` node are now flagged as "known
  zero" when the upper halves of its inputs are zero, which lets the
  backend see that the carry produced by this `UADDO` lowering is a
  1-bit result (see the note after this list).
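(For reference on the "known zero" claim: with both upper halves zero,
the 128-bit sum is at most 2 * (2^64 - 1) = 2^65 - 2, so the upper half
of the result can only be 0 or 1, and every bit of it above bit 0 is
guaranteed zero.)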
A few tests were then added to showcase the lowerings for various
operations that can be done with wide-arithmetic. Not all of them
optimize especially well at this time, but I wanted to include them as
a reference to have on hand for future evaluations if necessary.
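As a rough sketch of the `UADDO` change (abridged from the
`add_wide_u` test added below), an overflowing unsigned add such as

    %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)

now selects a single `i64.add128` whose upper operand halves are
`i64.const 0`; the low half of the result is the sum and the high half,
truncated to `i32`, is the carry.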
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 44 +++++-
.../WebAssembly/WebAssemblyISelLowering.h | 1 +
.../CodeGen/WebAssembly/wide-arithmetic.ll | 134 ++++++++++++++++++
3 files changed, 176 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 9ae46e709d823..82d3b8e292e60 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -170,6 +170,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::SUB, MVT::i128, Custom);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Custom);
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);
+ setOperationAction(ISD::UADDO, MVT::i64, Custom);
}
if (Subtarget->hasNontrappingFPToInt())
@@ -1109,6 +1110,18 @@ void WebAssemblyTargetLowering::computeKnownBitsForTargetNode(
}
}
}
+
+ // For 128-bit addition, if the upper halves of both operands are zero then
+ // all bits of the upper half of the result are known to be zero except for
+ // the lowest bit.
+ case WebAssemblyISD::I64_ADD128:
+ if (Op.getResNo() == 1) {
+ SDValue LHS_HI = Op.getOperand(1);
+ SDValue RHS_HI = Op.getOperand(3);
+ if (isNullConstant(LHS_HI) && isNullConstant(RHS_HI))
+ Known.Zero.setBitsFrom(1);
+ }
+ break;
}
}
@@ -1678,6 +1691,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI:
return LowerMUL_LOHI(Op, DAG);
+ case ISD::UADDO:
+ return LowerUADDO(Op, DAG);
}
}
@@ -1794,10 +1809,33 @@ SDValue WebAssemblyTargetLowering::LowerMUL_LOHI(SDValue Op,
}
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- SDValue Hi =
+ SDValue Lo =
DAG.getNode(Opcode, DL, DAG.getVTList(MVT::i64, MVT::i64), LHS, RHS);
- SDValue Lo(Hi.getNode(), 1);
- SDValue Ops[] = {Hi, Lo};
+ SDValue Hi(Lo.getNode(), 1);
+ SDValue Ops[] = {Lo, Hi};
+ return DAG.getMergeValues(Ops, DL);
+}
+
+// Lowers a `UADDO` node to an `i64.add128` instruction when wide-arithmetic
+// is enabled.
+//
+// This enables generating a single wasm instruction for this operation: the
+// upper half of each operand is a constant zero, and the upper half of the
+// result indicates whether the overflow happened.
+SDValue WebAssemblyTargetLowering::LowerUADDO(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->hasWideArithmetic());
+ assert(Op.getValueType() == MVT::i64);
+ assert(Op.getOpcode() == ISD::UADDO);
+ SDLoc DL(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ SDValue Result =
+ DAG.getNode(WebAssemblyISD::I64_ADD128, DL,
+ DAG.getVTList(MVT::i64, MVT::i64), LHS, Zero, RHS, Zero);
+ SDValue CarryI64(Result.getNode(), 1);
+ SDValue CarryI32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, CarryI64);
+ SDValue Ops[] = {Result, CarryI32};
return DAG.getMergeValues(Ops, DL);
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 90d31e38a7076..72401a7a259c0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -133,6 +133,7 @@ class WebAssemblyTargetLowering final : public TargetLowering {
SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
SDValue Replace128Op(SDNode *N, SelectionDAG &DAG) const;
+ SDValue LowerUADDO(SDValue Op, SelectionDAG &DAG) const;
// Custom DAG combine hooks
SDValue
diff --git a/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll b/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll
index deff551d0eabd..71974b012a2b6 100644
--- a/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll
+++ b/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll
@@ -130,3 +130,137 @@ define i64 @mul_i128_only_lo(i128 %a, i128 %b) {
%d = trunc i128 %c to i64
ret i64 %d
}
+
+declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64)
+declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)
+
+; This is a codegen test to see the effect of overflowing adds on signed
+; integers with wide-arithmetic enabled. At this time it doesn't generate
+; anything different than it would without wide-arithmetic, but it has also
+; been useful for evaluating the proposal.
+define { i64, i1 } @add_wide_s(i64 %a, i64 %b) {
+; CHECK-LABEL: add_wide_s:
+; CHECK: .functype add_wide_s (i32, i64, i64) -> ()
+; CHECK-NEXT: .local i64
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.add
+; CHECK-NEXT: local.tee 3
+; CHECK-NEXT: i64.store 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: i64.lt_s
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.lt_s
+; CHECK-NEXT: i32.xor
+; CHECK-NEXT: i32.store8 8
+; CHECK-NEXT: # fallthrough-return
+ %pair = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
+ ret { i64, i1 } %pair
+}
+
+define { i64, i1 } @add_wide_u(i64 %a, i64 %b) {
+; CHECK-LABEL: add_wide_u:
+; CHECK: .functype add_wide_u (i32, i64, i64) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: i64.add128
+; CHECK-NEXT: local.set 1
+; CHECK-NEXT: local.set 2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.store8 8
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.store 0
+; CHECK-NEXT: # fallthrough-return
+ %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+ ret { i64, i1 } %pair
+}
+
+; This is a model of a hypothetical `i64.add_wide3_u` instruction using LLVM
+; intrinsics. In theory this should optimize down to the equivalent of
+; `add_wide3_u_via_i128` below, but it doesn't currently.
+define { i64, i64 } @add_wide3_u_via_intrinsics(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: add_wide3_u_via_intrinsics:
+; CHECK: .functype add_wide3_u_via_intrinsics (i32, i64, i64, i64) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: i64.add128
+; CHECK-NEXT: local.set 2
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: i64.add128
+; CHECK-NEXT: local.set 1
+; CHECK-NEXT: i64.store 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.add
+; CHECK-NEXT: i64.store 8
+; CHECK-NEXT: # fallthrough-return
+ %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+ %t0 = extractvalue { i64, i1 } %pair, 0
+ %carry1 = extractvalue { i64, i1 } %pair, 1
+
+ %pair2 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %t0, i64 %c)
+ %ret1 = extractvalue { i64, i1 } %pair2, 0
+ %carry2 = extractvalue { i64, i1 } %pair2, 1
+
+ %carry1_64 = zext i1 %carry1 to i64
+ %carry2_64 = zext i1 %carry2 to i64
+ %ret2 = add i64 %carry1_64, %carry2_64
+
+ %r0 = insertvalue { i64, i64 } poison, i64 %ret1, 0
+ %r1 = insertvalue { i64, i64 } %r0, i64 %ret2, 1
+ ret { i64, i64 } %r1
+}
+
+; This is a model of a hypothetical `i64.add_wide3_u` instruction using
+; 128-bit integer addition. This currently optimizes better than the
+; intrinsic-based version above.
+define { i64, i64 } @add_wide3_u_via_i128(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: add_wide3_u_via_i128:
+; CHECK: .functype add_wide3_u_via_i128 (i32, i64, i64, i64) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: i64.add128
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i64.const 0
+; CHECK-NEXT: i64.add128
+; CHECK-NEXT: local.set 1
+; CHECK-NEXT: local.set 2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i64.store 8
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i64.store 0
+; CHECK-NEXT: # fallthrough-return
+ %a128 = zext i64 %a to i128
+ %b128 = zext i64 %b to i128
+ %c128 = zext i64 %c to i128
+ %t0 = add i128 %a128, %b128
+ %t1 = add i128 %t0, %c128
+ %result = trunc i128 %t1 to i64
+ %t2 = lshr i128 %t1, 64
+ %carry = trunc i128 %t2 to i64
+
+ %ret0 = insertvalue { i64, i64 } poison, i64 %result, 0
+ %ret1 = insertvalue { i64, i64 } %ret0, i64 %carry, 1
+ ret { i64, i64 } %ret1
+}