[llvm] [AggressiveInstCombine] Fold i64 x i64 -> i128 multiply-by-parts (PR #156879)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 28 07:09:19 PDT 2025
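For reference, the tests quoted below exercise the classic 32-bit by-parts
expansion of a 64 x 64 -> 128 multiply. A minimal C sketch of that shape, and
of the wide form the fold produces, looks roughly like this (function names
are illustrative, not taken from the patch; unsigned __int128 is a GCC/Clang
extension):

  #include <stdint.h>

  /* By-parts form: the shape the tests below feed to aggressive-instcombine. */
  uint64_t umulh_by_parts(uint64_t x, uint64_t y) {
    uint64_t x_lo = x & 0xffffffffu, x_hi = x >> 32;
    uint64_t y_lo = y & 0xffffffffu, y_hi = y >> 32;

    uint64_t lo_lo = y_lo * x_lo;
    uint64_t lo_hi = y_lo * x_hi;
    uint64_t hi_lo = y_hi * x_lo;
    uint64_t hi_hi = y_hi * x_hi;

    uint64_t cross = hi_lo + lo_hi;                    /* may wrap             */
    uint64_t carry = cross < lo_hi ? (1ull << 32) : 0; /* account for the wrap */

    uint64_t low_accum = (cross & 0xffffffffu) + (lo_lo >> 32);
    return hi_hi + carry + (cross >> 32) + (low_accum >> 32);
  }

  /* Wide form: what the by-parts high-half computation is rewritten into. */
  uint64_t umulh_wide(uint64_t x, uint64_t y) {
    return (uint64_t)(((unsigned __int128)x * y) >> 64);
  }

  /* full_mul_int128 below additionally reconstructs the low half, which in
     the wide form is just a truncation of the same product. */
  void full_mul(uint64_t x, uint64_t y, uint64_t out[2]) {
    unsigned __int128 wide = (unsigned __int128)x * y;
    out[0] = (uint64_t)wide;         /* low 64 bits  */
    out[1] = (uint64_t)(wide >> 64); /* high 64 bits */
  }

The commuted test variants only swap operand order on the commutative ops; the
negative test shown checks that the fold currently backs off when 'x_lo' has an
extra use.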
================
@@ -0,0 +1,2571 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=aggressive-instcombine -S | FileCheck %s
+
+; https://alive2.llvm.org/ce/z/KuJPnU
+define i64 @umulh(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = mul i128 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i128 [[TMP2]], 64
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; https://alive2.llvm.org/ce/z/MSo5S_
+define i64 @umulh_variant(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT: ret i64 [[TMP5]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+; Commutative ops should match in any order. Ops where operand order has been
+; reversed from above are marked 'commuted'. As per the InstCombine
+; contributors' guide, constants are always canonicalized to the RHS, so don't
+; bother commuting constants.
+define i64 @umulh__commuted(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__commuted(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = mul i128 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i128 [[TMP2]], 64
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %x_hi, %y_lo ; commuted
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %x_lo, %y_hi ; commuted
+ %y_lo_x_lo = mul nuw i64 %x_lo, %y_lo ; commuted
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_lo_x_hi, %y_hi_x_lo ; commuted
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 4294967295, %cross_sum ; commuted
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %y_lo_x_lo_hi, %cross_sum_lo ; commuted
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %y_hi_x_hi, %cross_sum_hi ; commuted
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %carry, %intermediate ; commuted
+ %hw64 = add i64 %low_accum_hi, %intermediate_plus_carry ; commuted
+
+ ret i64 %hw64
+}
+
+define i64 @umulh_variant_commuted(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant_commuted(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
+; CHECK-NEXT: ret i64 [[TMP5]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %x_lo, %y_lo ; commuted
+ %t1 = mul nuw i64 %x_hi, %y_lo ; commuted
+ %t2 = mul nuw i64 %x_lo, %y_hi ; commuted
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t1, %t0_hi ; commuted
+ %u0_lo = and i64 4294967295, %u0 ; commuted
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %t2, %u0_lo ; commuted
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %t3, %u0_hi ; commuted
+ %hw64 = add nuw i64 %u1_hi, %u2 ; commuted
+ ret i64 %hw64
+}
+
+; https://alive2.llvm.org/ce/z/PPXtkR
+define void @full_mul_int128(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = mul i128 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i128 [[TMP2]], 64
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[TMP4]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP6:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP7:%.*]] = mul i128 [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = trunc i128 [[TMP7]] to i64
+; CHECK-NEXT: store i64 [[TMP8]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ ; Store high 64 bits
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ ; Reconstruct low 64 bits
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ ; Store low 64 bits
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; Negative tests
+
+; 'x_lo' must have exactly 2 uses.
+define i64 @umulh__mul_use__x_lo(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__x_lo(
+; CHECK-NOT: i128
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ call void (...) @llvm.fake.use(i64 %x_lo)
----------------
davemgreen wrote:
It may be fine to transform when the uses are this close to the final operands - doing the transform can still remove more code overall.
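As an illustration of that point (names here are hypothetical, not from the
patch): even when x_lo has a second use, so its 'and' has to stay, the rest of
the by-parts sequence could still collapse to a single wide multiply, roughly:

  #include <stdint.h>

  uint64_t umulh_extra_use(uint64_t x, uint64_t y, uint64_t *sink) {
    uint64_t x_lo = x & 0xffffffffu;
    *sink = x_lo;  /* the extra use keeps the 'and' alive */
    /* everything else from the by-parts expansion goes away */
    return (uint64_t)(((unsigned __int128)x * y) >> 64);
  }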
https://github.com/llvm/llvm-project/pull/156879