[llvm] [AggressiveInstCombine] Match long high-half multiply (PR #168396)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 20 00:45:51 PST 2025
https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/168396
From 2d35ea57a771363b52f8f153da12a271cbed8793 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 17 Nov 2025 11:29:29 +0000
Subject: [PATCH 1/4] [AggressiveInstCombine] Add various tests for
high-multiply
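
These tests exercise the carry form of the long high-half multiply,
where overflow of the cross-product accumulation is recovered with an
unsigned compare instead of by masking the partial sums. As a rough
guide (a sketch for illustration only; the C names are mine, not taken
from the tests), the i32 variants correspond to source along the lines
of:

  #include <stdint.h>

  /* High 32 bits of the full 64-bit product x*y, computed with only
     32-bit arithmetic. If the cross-product accumulation wraps, the
     wrapped sum is smaller than the hl addend, so an unsigned compare
     recovers the carry. */
  uint32_t mul_carry(uint32_t x, uint32_t y) {
    uint32_t x_hi = x >> 16, x_lo = x & 0xffff;
    uint32_t y_hi = y >> 16, y_lo = y & 0xffff;
    uint32_t hl = x_hi * y_lo;                 /* cross products */
    uint32_t lh = x_lo * y_hi;
    uint32_t ll = x_lo * y_lo;                 /* low product */
    uint32_t mid = hl + lh + (ll >> 16);       /* may wrap once */
    uint32_t carry = mid < hl ? 0x10000u : 0u; /* wrap => carry */
    return x_hi * y_hi + carry + (mid >> 16);
  }

umulh_carry4.ll covers the same idiom written out with all four
partial products, at i32, i64 and i128 widths as well as vector types.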
---
.../AggressiveInstCombine/umulh_carry.ll | 825 +++++
.../AggressiveInstCombine/umulh_carry4.ll | 3227 +++++++++++++++++
.../AggressiveInstCombine/umulh_ladder.ll | 904 +++++
.../AggressiveInstCombine/umulh_ladder4.ll | 600 +++
4 files changed, 5556 insertions(+)
create mode 100644 llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll
create mode 100644 llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll
create mode 100644 llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll
create mode 100644 llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll
diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll
new file mode 100644
index 0000000000000..b9801370028cc
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll
@@ -0,0 +1,825 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s
+
+; Carry variant of mul-high. https://alive2.llvm.org/ce/z/G2bD6o
+define i32 @mul_carry(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ ret i32 %add11
+}
+
+; Carry variant of mul-high. https://alive2.llvm.org/ce/z/G2bD6o
+define i128 @mul_carry_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: define i128 @mul_carry_i128(
+; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i128 [[X]], 64
+; CHECK-NEXT: [[AND:%.*]] = and i128 [[X]], 18446744073709551615
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i128 [[Y]], 64
+; CHECK-NEXT: [[AND2:%.*]] = and i128 [[Y]], 18446744073709551615
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i128 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i128 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i128 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i128 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i128 [[MUL4]], 64
+; CHECK-NEXT: [[ADD6:%.*]] = add i128 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i128 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i128 18446744073709551616, i128 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i128 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i128 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i128 [[ADD6]], 64
+; CHECK-NEXT: [[ADD11:%.*]] = add i128 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i128 [[ADD11]]
+;
+entry:
+ %shr = lshr i128 %x, 64
+ %and = and i128 %x, u0xffffffffffffffff
+ %shr1 = lshr i128 %y, 64
+ %and2 = and i128 %y, u0xffffffffffffffff
+ %mul = mul nuw i128 %shr, %and2
+ %mul3 = mul nuw i128 %and, %shr1
+ %add = add i128 %mul, %mul3
+ %mul4 = mul nuw i128 %and, %and2
+ %shr5 = lshr i128 %mul4, 64
+ %add6 = add i128 %add, %shr5
+ %cmp = icmp ult i128 %add6, %mul
+ %cond = select i1 %cmp, i128 u0x10000000000000000, i128 0
+ %mul8 = mul nuw i128 %shr, %shr1
+ %add9 = add nuw i128 %mul8, %cond
+ %shr10 = lshr i128 %add6, 64
+ %add11 = add i128 %add9, %shr10
+ ret i128 %add11
+}
+
+; Carry variant of mul-high. https://alive2.llvm.org/ce/z/G2bD6o
+define <4 x i32> @mul_carry_v4i32(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: define <4 x i32> @mul_carry_v4i32(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 16)
+; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[X]], splat (i32 65535)
+; CHECK-NEXT: [[SHR1:%.*]] = lshr <4 x i32> [[Y]], splat (i32 16)
+; CHECK-NEXT: [[AND2:%.*]] = and <4 x i32> [[Y]], splat (i32 65535)
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw <4 x i32> [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw <4 x i32> [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw <4 x i32> [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr <4 x i32> [[MUL4]], splat (i32 16)
+; CHECK-NEXT: [[ADD6:%.*]] = add <4 x i32> [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult <4 x i32> [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> splat (i32 65536), <4 x i32> zeroinitializer
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw <4 x i32> [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw <4 x i32> [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr <4 x i32> [[ADD6]], splat (i32 16)
+; CHECK-NEXT: [[ADD11:%.*]] = add <4 x i32> [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret <4 x i32> [[ADD11]]
+;
+entry:
+ %shr = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
+ %and = and <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
+ %shr1 = lshr <4 x i32> %y, <i32 16, i32 16, i32 16, i32 16>
+ %and2 = and <4 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535>
+ %mul = mul nuw <4 x i32> %shr, %and2
+ %mul3 = mul nuw <4 x i32> %and, %shr1
+ %add = add <4 x i32> %mul, %mul3
+ %mul4 = mul nuw <4 x i32> %and, %and2
+ %shr5 = lshr <4 x i32> %mul4, <i32 16, i32 16, i32 16, i32 16>
+ %add6 = add <4 x i32> %add, %shr5
+ %cmp = icmp ult <4 x i32> %add6, %mul
+ %cond = select <4 x i1> %cmp, <4 x i32> <i32 65536, i32 65536, i32 65536, i32 65536>, <4 x i32> zeroinitializer
+ %mul8 = mul nuw <4 x i32> %shr, %shr1
+ %add9 = add nuw <4 x i32> %mul8, %cond
+ %shr10 = lshr <4 x i32> %add6, <i32 16, i32 16, i32 16, i32 16>
+ %add11 = add <4 x i32> %add9, %shr10
+ ret <4 x i32> %add11
+}
+
+; Check carry against xlyh, not xhyl
+define i32 @mul_carry_xlyh(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_xlyh(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL3]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul3
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ ret i32 %add11
+}
+
+define i32 @mul_carry_comm(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_comm(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[AND2]], [[SHR]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[SHR1]], [[AND]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL3]], [[MUL]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[SHR5]], [[ADD]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD9:%.*]] = or disjoint i32 [[COND]], [[SHR10]]
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[MUL8]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %and2, %shr
+ %mul3 = mul nuw i32 %shr1, %and
+ %add = add i32 %mul3, %mul
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %shr5, %add
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %shr10 = lshr i32 %add6, 16
+ %add9 = add nuw i32 %cond, %shr10
+ %add11 = add i32 %add9, %mul8
+ ret i32 %add11
+}
+
+
+; Negative tests
+
+
+define i32 @mul_carry_notxlo(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_notxlo(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 32767
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw nsw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw nsw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 32767 ; wrong mask
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ ret i32 %add11
+}
+
+define i32 @mul_carry_notyhi(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_notyhi(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 14
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+  %shr1 = lshr i32 %y, 14 ; wrong shift
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ ret i32 %add11
+}
+
+define i32 @mul_carry_notcarry(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_notcarry(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 0, i32 65536
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 0, i32 65536 ; backwards
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ ret i32 %add11
+}
+
+define i32 @mul_carry_notlolo(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_notlolo(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ ret i32 %add11
+}
+
+define i32 @mul_carry_nothihi(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_nothihi(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL4]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul4, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ ret i32 %add11
+}
+
+; Extra uses
+define i32 @mul_carry_use_carry(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_use_carry(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[COND]])
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ call void (...) @llvm.fake.use(i32 %cond)
+ ret i32 %add11
+}
+
+define i32 @mul_carry_use_mulhi(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_use_mulhi(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MUL8]])
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ call void (...) @llvm.fake.use(i32 %mul8)
+ ret i32 %add11
+}
+
+define i32 @mul_carry_use_llh(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_use_llh(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[ADD6:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD]], [[SHR10]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD7]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR11:%.*]] = lshr i32 [[ADD7]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR11]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[SHR10]])
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ call void (...) @llvm.fake.use(i32 %shr5)
+ ret i32 %add11
+}
+
+define i32 @mul_carry_use_mulll(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_use_mulll(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MUL4]])
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ call void (...) @llvm.fake.use(i32 %mul4)
+ ret i32 %add11
+}
+
+define i32 @mul_carry_use_mullh(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_use_mullh(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MUL3]])
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ call void (...) @llvm.fake.use(i32 %mul3)
+ ret i32 %add11
+}
+
+define i32 @mul_carry_use_mulhl(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_use_mulhl(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MUL]])
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ call void (...) @llvm.fake.use(i32 %mul)
+ ret i32 %add11
+}
+
+define i32 @mul_carry_use_crosssum(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_use_crosssum(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[ADD9:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[SHR10:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD11]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[ADD9]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD10:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR11:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[ADD10]], [[SHR11]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ADD11]])
+; CHECK-NEXT: ret i32 [[TMP4]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ call void (...) @llvm.fake.use(i32 %add)
+ ret i32 %add11
+}
+
+define i32 @mul_carry_use_lowaccumhi(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_use_lowaccumhi(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[ADD6:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD]], [[SHR10]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD7]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR11:%.*]] = lshr i32 [[ADD7]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR11]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[SHR11]])
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ call void (...) @llvm.fake.use(i32 %shr10)
+ ret i32 %add11
+}
+
+define i32 @mul_carry_use_lowaccum(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_carry_use_lowaccum(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
+; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
+; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
+; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
+; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
+; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ADD6]])
+; CHECK-NEXT: ret i32 [[ADD11]]
+;
+entry:
+ %shr = lshr i32 %x, 16
+ %and = and i32 %x, 65535
+ %shr1 = lshr i32 %y, 16
+ %and2 = and i32 %y, 65535
+ %mul = mul nuw i32 %shr, %and2
+ %mul3 = mul nuw i32 %and, %shr1
+ %add = add i32 %mul, %mul3
+ %mul4 = mul nuw i32 %and, %and2
+ %shr5 = lshr i32 %mul4, 16
+ %add6 = add i32 %add, %shr5
+ %cmp = icmp ult i32 %add6, %mul
+ %cond = select i1 %cmp, i32 65536, i32 0
+ %mul8 = mul nuw i32 %shr, %shr1
+ %add9 = add nuw i32 %mul8, %cond
+ %shr10 = lshr i32 %add6, 16
+ %add11 = add i32 %add9, %shr10
+ call void (...) @llvm.fake.use(i32 %add6)
+ ret i32 %add11
+}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll
new file mode 100644
index 0000000000000..d92434a7a7ea5
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll
@@ -0,0 +1,3227 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s
+
+; https://alive2.llvm.org/ce/z/KuJPnU
+define i64 @umulh(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; Commutative ops should match in any order. Ops whose operand order has been
+; reversed from above are marked 'commuted'. Per the InstCombine contributors'
+; guide, constants are always canonicalized to the RHS, so there is no need to
+; commute constants.
+define i64 @umulh__commuted(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__commuted(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[X_HI]], [[Y_LO]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[X_LO]], [[Y_HI]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[X_LO]], [[Y_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_LO_X_HI]], [[Y_HI_X_LO]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[Y_LO_X_LO_HI]], [[CROSS_SUM_LO]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[CARRY]], [[INTERMEDIATE]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[LOW_ACCUM_HI]], [[INTERMEDIATE_PLUS_CARRY]]
+; CHECK-NEXT: ret i64 [[TMP4]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %x_hi, %y_lo ; commuted
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %x_lo, %y_hi ; commuted
+ %y_lo_x_lo = mul nuw i64 %x_lo, %y_lo ; commuted
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_lo_x_hi, %y_hi_x_lo ; commuted
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %y_lo_x_lo_hi, %cross_sum_lo ; commuted
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %y_hi_x_hi, %cross_sum_hi ; commuted
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %carry, %intermediate ; commuted
+ %hw64 = add i64 %low_accum_hi, %intermediate_plus_carry ; commuted
+
+ ret i64 %hw64
+}
+
+define i32 @mulh_src32(i32 %x, i32 %y) {
+ ; Extract low and high 16 bits
+; CHECK-LABEL: define i32 @mulh_src32(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[Y_LO:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i32 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i32 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i32 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i32 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i32 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i32 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i32 65536, i32 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i32 [[Y_LO_X_LO]], 16
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i32 [[CROSS_SUM]], 65535
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i32 [[CROSS_SUM]], 16
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i32 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i32 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i32 [[LOW_ACCUM]], 16
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i32 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i32 [[TMP5]]
+;
+  %x_lo = and i32 %x, u0xffff ; x & 0xffff
+  %y_lo = and i32 %y, u0xffff ; y & 0xffff
+ %x_hi = lshr i32 %x, 16 ; x >> 16
+ %y_hi = lshr i32 %y, 16 ; y >> 16
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i32 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i32 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i32 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i32 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+  %cross_sum = add i32 %y_hi_x_lo, %y_lo_x_hi ; full 32-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i32 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i32 u0x10000, i32 0 ; if overflow, add 1 << 16
+
+ ; High 16 bits of low product
+ %y_lo_x_lo_hi = lshr i32 %y_lo_x_lo, 16
+
+ ; Low and high 16 bits of cross_sum
+ %cross_sum_lo = and i32 %cross_sum, u0xffff
+ %cross_sum_hi = lshr i32 %cross_sum, 16
+
+ %low_accum = add nuw nsw i32 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i32 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i32 %low_accum, 16
+ %intermediate_plus_carry = add i32 %intermediate, %carry
+ %hw64 = add i32 %intermediate_plus_carry, %low_accum_hi
+
+ ret i32 %hw64
+}
+
+define i128 @mulh_src128(i128 %x, i128 %y) {
+ ; Extract low and high 64 bits
+; CHECK-LABEL: define i128 @mulh_src128(
+; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i128 [[X]], 18446744073709551615
+; CHECK-NEXT: [[Y_LO:%.*]] = and i128 [[Y]], 18446744073709551615
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i128 [[X]], 64
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i128 [[Y]], 64
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i128 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i128 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i128 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i128 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i128 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i128 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i128 18446744073709551616, i128 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i128 [[Y_LO_X_LO]], 64
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i128 [[CROSS_SUM]], 18446744073709551615
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i128 [[CROSS_SUM]], 64
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i128 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i128 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i128 [[LOW_ACCUM]], 64
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i128 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i128 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i128 [[HW64]]
+;
+  %x_lo = and i128 %x, u0xffffffffffffffff ; x & 0xffffffffffffffff
+  %y_lo = and i128 %y, u0xffffffffffffffff ; y & 0xffffffffffffffff
+  %x_hi = lshr i128 %x, 64 ; x >> 64
+  %y_hi = lshr i128 %y, 64 ; y >> 64
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i128 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i128 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i128 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i128 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+  %cross_sum = add i128 %y_hi_x_lo, %y_lo_x_hi ; full 128-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i128 %cross_sum, %y_lo_x_hi
+  %carry = select i1 %carry_out, i128 u0x10000000000000000, i128 0 ; if overflow, add 1 << 64
+
+  ; High 64 bits of low product
+ %y_lo_x_lo_hi = lshr i128 %y_lo_x_lo, 64
+
+  ; Low and high 64 bits of cross_sum
+ %cross_sum_lo = and i128 %cross_sum, u0xffffffffffffffff
+ %cross_sum_hi = lshr i128 %cross_sum, 64
+
+ %low_accum = add nuw nsw i128 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i128 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i128 %low_accum, 64
+ %intermediate_plus_carry = add i128 %intermediate, %carry
+ %hw64 = add i128 %intermediate_plus_carry, %low_accum_hi
+
+ ret i128 %hw64
+}
+
+define <2 x i32> @mulh_v2i32(<2 x i32> %x, <2 x i32> %y) {
+ ; Extract low and high 16 bits
+; CHECK-LABEL: define <2 x i32> @mulh_v2i32(
+; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and <2 x i32> [[X]], splat (i32 65535)
+; CHECK-NEXT: [[Y_LO:%.*]] = and <2 x i32> [[Y]], splat (i32 65535)
+; CHECK-NEXT: [[X_HI:%.*]] = lshr <2 x i32> [[X]], splat (i32 16)
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr <2 x i32> [[Y]], splat (i32 16)
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add <2 x i32> [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult <2 x i32> [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select <2 x i1> [[CARRY_OUT]], <2 x i32> splat (i32 65536), <2 x i32> zeroinitializer
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr <2 x i32> [[Y_LO_X_LO]], splat (i32 16)
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and <2 x i32> [[CROSS_SUM]], splat (i32 65535)
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr <2 x i32> [[CROSS_SUM]], splat (i32 16)
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw <2 x i32> [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw <2 x i32> [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr <2 x i32> [[LOW_ACCUM]], splat (i32 16)
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add <2 x i32> [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add <2 x i32> [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret <2 x i32> [[HW64]]
+;
+ %x_lo = and <2 x i32> %x, <i32 u0xffff, i32 u0xffff>
+ %y_lo = and <2 x i32> %y, <i32 u0xffff, i32 u0xffff>
+ %x_hi = lshr <2 x i32> %x, <i32 16, i32 16>
+ %y_hi = lshr <2 x i32> %y, <i32 16, i32 16>
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw <2 x i32> %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw <2 x i32> %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw <2 x i32> %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw <2 x i32> %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+  %cross_sum = add <2 x i32> %y_hi_x_lo, %y_lo_x_hi ; full 32-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult <2 x i32> %cross_sum, %y_lo_x_hi
+ %carry = select <2 x i1> %carry_out, <2 x i32> <i32 u0x10000, i32 u0x10000>, <2 x i32> <i32 0, i32 0>
+
+ ; High 16 bits of low product
+ %y_lo_x_lo_hi = lshr <2 x i32> %y_lo_x_lo, <i32 16, i32 16>
+
+ ; Low and high 16 bits of cross_sum
+ %cross_sum_lo = and <2 x i32> %cross_sum, <i32 u0xffff, i32 u0xffff>
+ %cross_sum_hi = lshr <2 x i32> %cross_sum, <i32 16, i32 16>
+
+ %low_accum = add nuw nsw <2 x i32> %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw <2 x i32> %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr <2 x i32> %low_accum, <i32 16, i32 16>
+ %intermediate_plus_carry = add <2 x i32> %intermediate, %carry
+ %hw64 = add <2 x i32> %intermediate_plus_carry, %low_accum_hi
+
+ ret <2 x i32> %hw64
+}
+
+; https://alive2.llvm.org/ce/z/PPXtkR
+define void @full_mul_int128(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[TMP4]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[TMP8]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ ; Store high 64 bits
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ ; Reconstruct low 64 bits
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ ; Store low 64 bits
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+
+; Negative tests
+
+define i64 @umulh_notandx(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_notandx(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967294
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967294 ; x & 0xfffffffe
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+define i64 @umulh_notandy(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_notandy(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967294
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967294 ; y & 0xfffffffe
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+define i64 @umulh_notshiftx(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_notshiftx(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 16
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 16 ; x >> 16
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+define i64 @umulh_notshifty(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_notshifty(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 16
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 16 ; y >> 16
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+define i64 @umulh_notcarry(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_notcarry(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967295, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967295, i64 0 ; if overflow, add wrong value
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+define i64 @umulh_notxlo(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_notxlo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x ; y_lo * x
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+define i64 @umulh_notcrosssum(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_notcrosssum(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = shl i64 [[Y_HI_X_LO]], 1
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967294
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+  %cross_sum = add i64 %y_hi_x_lo, %y_hi_x_lo ; deliberately wrong cross_sum: y_hi_x_lo added twice
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; Extra-use tests: check how many uses each value in the matched pattern may have.
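+;
+; Each test below uses @llvm.fake.use to pin one extra use on a single value,
+; probing the use-count limit of that node in the matched pattern. For
+; reference, the pattern as a whole computes the high half of a 64x64->128
+; multiply; a sketch of the equivalent widened IR (hypothetical, not part of
+; these tests) would be:
+;   %x.wide = zext i64 %x to i128
+;   %y.wide = zext i64 %y to i128
+;   %prod = mul i128 %x.wide, %y.wide
+;   %hi = lshr i128 %prod, 64
+;   %hw64 = trunc i128 %hi to i64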
+
+; 'x_lo' can have more than 2 uses.
+define i64 @umulh__mul_use__x_lo(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__x_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_LO]])
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ call void (...) @llvm.fake.use(i64 %x_lo)
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'y_hi' can have more than 2 uses.
+define i64 @umulh__mul_use__y_hi(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__y_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI]])
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+ call void (...) @llvm.fake.use(i64 %y_hi)
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'y_lo * x_hi' must have no more than 2 uses (it already has 2 within the
+; pattern: the cross_sum add and the carry icmp).
+define i64 @umulh__mul_use__y_lo_x_hi(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_HI]])
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ call void (...) @llvm.fake.use(i64 %y_lo_x_hi)
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'y_hi * x_hi' must have a single use.
+define i64 @umulh__mul_use__y_hi_x_hi(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI_X_HI]])
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ call void (...) @llvm.fake.use(i64 %y_hi_x_hi)
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'y_hi * x_lo' must have a single use.
+define i64 @umulh__mul_use__y_hi_x_lo(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__y_hi_x_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI_X_LO]])
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ call void (...) @llvm.fake.use(i64 %y_hi_x_lo)
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'y_lo * x_lo' has a single use if only the high part of the multiply is
+; computed, and 2 uses when both the low and high parts are. Doing the
+; optimization when only the high part is computed and there is a second,
+; unrelated use still results in fewer instructions (only x_lo, y_lo and the
+; y_lo * x_lo product must stay live; the rest of the pattern folds away), so
+; it is likely profitable and this seems OK.
+define i64 @umulh__mul_use__y_lo_x_lo(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO]])
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[TMP5]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+ call void (...) @llvm.fake.use(i64 %y_lo_x_lo)
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'cross_sum' must have no more than 3 uses (it already has 3 within the
+; pattern: the carry icmp, the low-half mask and the high-half shift).
+define i64 @umulh__mul_use__cross_sum(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM]])
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ call void (...) @llvm.fake.use(i64 %cross_sum)
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'carry_out' must have a single use.
+define i64 @umulh__mul_use__carry_out(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__carry_out(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i1 [[CARRY_OUT]])
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ call void (...) @llvm.fake.use(i1 %carry_out)
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'carry' must have a single use.
+define i64 @umulh__mul_use__carry(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__carry(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CARRY]])
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+ call void (...) @llvm.fake.use(i64 %carry)
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'y_lo_x_lo_hi' must have a single use.
+define i64 @umulh__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__y_lo_x_lo_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO_HI]])
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+ call void (...) @llvm.fake.use(i64 %y_lo_x_lo_hi)
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'cross_sum_lo' must have a single use.
+define i64 @umulh__mul_use__cross_sum_lo(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM_LO]])
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ call void (...) @llvm.fake.use(i64 %cross_sum_lo)
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'cross_sum_hi' must have a single use.
+define i64 @umulh__mul_use__cross_sum_hi(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__cross_sum_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM_HI]])
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+ call void (...) @llvm.fake.use(i64 %cross_sum_hi)
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'low_accum' has a single use if only the high part of the multiply is
+; computed, and 2 uses when both the low and high parts are. There is an
+; unrelated use here, but the transform still seems profitable.
+define i64 @umulh__mul_use__low_accum(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__low_accum(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM]])
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[TMP5]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+ call void (...) @llvm.fake.use(i64 %low_accum)
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'intermediate' must have a single use.
+define i64 @umulh__mul_use__intermediate(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__intermediate(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[INTERMEDIATE]])
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ call void (...) @llvm.fake.use(i64 %intermediate)
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'low_accum_hi' must have a single use.
+define i64 @umulh__mul_use__low_accum_hi(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__low_accum_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_HI]])
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ call void (...) @llvm.fake.use(i64 %low_accum_hi)
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+
+ ret i64 %hw64
+}
+
+; 'intermediate_plus_carry' must have a single use.
+define i64 @umulh__mul_use__intermediate_plus_carry(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh__mul_use__intermediate_plus_carry(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[INTERMEDIATE_PLUS_CARRY]])
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ ; Extract low and high 32 bits
+ %x_lo = and i64 %x, 4294967295 ; x & 0xffffffff
+ %y_lo = and i64 %y, 4294967295 ; y & 0xffffffff
+ %x_hi = lshr i64 %x, 32 ; x >> 32
+ %y_hi = lshr i64 %y, 32 ; y >> 32
+
+ ; Cross products
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi ; y_lo * x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi ; y_hi * x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo ; y_hi * x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo ; y_lo * x_lo
+
+ ; Add cross terms
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi ; full 64-bit sum
+
+ ; Carry if overflowed
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0 ; if overflow, add 1 << 32
+
+ ; High 32 bits of low product
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ ; Low and high 32 bits of cross_sum
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ ; Final result accumulation
+ %intermediate = add nuw i64 %cross_sum_hi, %y_hi_x_hi
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %intermediate_plus_carry = add i64 %intermediate, %carry
+ %hw64 = add i64 %intermediate_plus_carry, %low_accum_hi
+ call void (...) @llvm.fake.use(i64 %intermediate_plus_carry)
+
+ ret i64 %hw64
+}
+
+; 'x_lo' can have multiple uses.
+define void @full_mul_int128__mul_use__x_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__x_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_LO]])
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ call void (...) @llvm.fake.use(i64 %x_lo)
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
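+
+; For reference, a minimal C model of the full 128-bit multiply variant above
+; (illustrative sketch only; the names are made up):
+;
+;   #include <stdint.h>
+;
+;   // Full 64x64->128 multiply: low word stored at p[0], high word at p[1],
+;   // corresponding to the lw64/hw64 stores in these tests.
+;   void mul128(uint64_t x, uint64_t y, uint64_t *p) {
+;     uint64_t x_lo = x & 0xffffffffu, x_hi = x >> 32;
+;     uint64_t y_lo = y & 0xffffffffu, y_hi = y >> 32;
+;     uint64_t lo_lo = x_lo * y_lo;
+;     uint64_t cross = y_hi * x_lo + y_lo * x_hi;            // may wrap
+;     uint64_t carry = cross < y_lo * x_hi ? 1ull << 32 : 0;
+;     uint64_t low_accum = (cross & 0xffffffffu) + (lo_lo >> 32);
+;     p[1] = y_hi * x_hi + carry + (cross >> 32) + (low_accum >> 32);
+;     p[0] = (low_accum << 32) | (lo_lo & 0xffffffffu);
+;   }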
+
+; 'y_lo' can have multiple uses.
+define void @full_mul_int128__mul_use__y_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO]])
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ call void (...) @llvm.fake.use(i64 %y_lo)
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'x_hi' can have multiple uses.
+define void @full_mul_int128__mul_use__x_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__x_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_HI]])
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ call void (...) @llvm.fake.use(i64 %x_hi)
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'y_hi' can have multiple uses.
+define void @full_mul_int128__mul_use__y_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI]])
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+ call void (...) @llvm.fake.use(i64 %y_hi)
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'y_lo_x_hi' must have exactly 2 uses.
+define void @full_mul_int128__mul_use__y_lo_x_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_HI]])
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ call void (...) @llvm.fake.use(i64 %y_lo_x_hi)
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'y_hi_x_hi' must have a single use.
+define void @full_mul_int128__mul_use__y_hi_x_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI_X_HI]])
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ call void (...) @llvm.fake.use(i64 %y_hi_x_hi)
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'y_hi_x_lo' must have a single use.
+define void @full_mul_int128__mul_use__y_hi_x_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi_x_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI_X_LO]])
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ call void (...) @llvm.fake.use(i64 %y_hi_x_lo)
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'y_lo_x_lo' can have multiple uses.
+; TODO: this does not seem to simplify as it should.
+define void @full_mul_int128__mul_use__y_lo_x_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO]])
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[TMP6]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[TMP6]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[TMP6]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI1:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS1:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UPPER_MID_WITH_CROSS1]], [[LOW_ACCUM_HI1]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+ call void (...) @llvm.fake.use(i64 %y_lo_x_lo)
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'cross_sum' must have no more than 3 uses.
+define void @full_mul_int128__mul_use__cross_sum(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM]])
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+ call void (...) @llvm.fake.use(i64 %cross_sum)
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'carry_out' must have a single use.
+define void @full_mul_int128__mul_use__carry_out(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__carry_out(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i1 [[CARRY_OUT]])
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ call void (...) @llvm.fake.use(i1 %carry_out)
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'carry' must have a single use.
+define void @full_mul_int128__mul_use__carry(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__carry(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CARRY]])
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+ call void (...) @llvm.fake.use(i64 %carry)
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'y_lo_x_lo_hi' must have a single use.
+define void @full_mul_int128__mul_use__y_lo_x_lo_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo_x_lo_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO_HI]])
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+ call void (...) @llvm.fake.use(i64 %y_lo_x_lo_hi)
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'cross_sum_lo' must have a single use.
+define void @full_mul_int128__mul_use__cross_sum_lo(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM_LO]])
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ call void (...) @llvm.fake.use(i64 %cross_sum_lo)
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'cross_sum_hi' must have a single use.
+define void @full_mul_int128__mul_use__cross_sum_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__cross_sum_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[CROSS_SUM_HI]])
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+ call void (...) @llvm.fake.use(i64 %cross_sum_hi)
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'low_accum' must have exactly 2 uses when matching a high multiply.
+define void @full_mul_int128__mul_use__low_accum(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM]])
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+ call void (...) @llvm.fake.use(i64 %low_accum)
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'upper_mid' must have a single use.
+define void @full_mul_int128__mul_use__upper_mid(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[UPPER_MID]])
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[TMP9:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[TMP9]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ call void (...) @llvm.fake.use(i64 %upper_mid)
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'low_accum_hi' must have a single use.
+define void @full_mul_int128__mul_use__low_accum_hi(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_HI]])
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ call void (...) @llvm.fake.use(i64 %low_accum_hi)
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'upper_mid_with_cross' must have single use.
+define void @full_mul_int128__mul_use__upper_mid_with_cross(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__upper_mid_with_cross(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_HI]])
+; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[TMP4:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[TMP4]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ call void (...) @llvm.fake.use(i64 %low_accum_hi)
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
+; 'low_accum_shifted' can have multiple uses.
+define void @full_mul_int128__mul_use__low_accum_shifted(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_shifted(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
+; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
+; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
+; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
+; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
+; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
+; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
+; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_SHIFTED]])
+; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
+; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
+; CHECK-NEXT: ret void
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %y_lo_x_hi = mul nuw i64 %y_lo, %x_hi
+ %y_hi_x_hi = mul nuw i64 %y_hi, %x_hi
+ %y_hi_x_lo = mul nuw i64 %y_hi, %x_lo
+ %y_lo_x_lo = mul nuw i64 %y_lo, %x_lo
+
+ %cross_sum = add i64 %y_hi_x_lo, %y_lo_x_hi
+
+ %carry_out = icmp ult i64 %cross_sum, %y_lo_x_hi
+ %carry = select i1 %carry_out, i64 4294967296, i64 0
+
+ %y_lo_x_lo_hi = lshr i64 %y_lo_x_lo, 32
+
+ %cross_sum_lo = and i64 %cross_sum, 4294967295
+ %cross_sum_hi = lshr i64 %cross_sum, 32
+
+ %low_accum = add nuw nsw i64 %cross_sum_lo, %y_lo_x_lo_hi
+
+ %upper_mid = add nuw i64 %y_hi_x_hi, %carry
+ %low_accum_hi = lshr i64 %low_accum, 32
+ %upper_mid_with_cross = add i64 %upper_mid, %cross_sum_hi
+ %hw64 = add i64 %upper_mid_with_cross, %low_accum_hi
+
+ %hi_ptr = getelementptr inbounds i8, ptr %p, i64 8
+ store i64 %hw64, ptr %hi_ptr, align 8
+
+ %low_accum_shifted = shl i64 %low_accum, 32
+ call void (...) @llvm.fake.use(i64 %low_accum_shifted)
+ %y_lo_x_lo_lo = and i64 %y_lo_x_lo, 4294967295
+ %lw64 = or disjoint i64 %low_accum_shifted, %y_lo_x_lo_lo
+
+ store i64 %lw64, ptr %p, align 8
+
+ ret void
+}
+
diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll
new file mode 100644
index 0000000000000..6e56eb86516c5
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll
@@ -0,0 +1,904 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s
+
+; https://alive2.llvm.org/ce/z/MSo5S_
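+; With B = 2^32, x = x_hi*B + x_lo and y = y_hi*B + y_lo, so the full
+; product is t3*B^2 + (t1 + t2)*B + t0. The ladder below folds the carry
+; out of each partial sum into the next 32-bit limb, so no addition wraps
+; and the final sum is the high 64 bits of the 128-bit product.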
+define i64 @umulh_variant(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[TMP5]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+define i32 @umulh_variant_i32(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @umulh_variant_i32(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[Y_LO:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i32 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i32 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i32 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i32 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i32 [[T0]], 16
+; CHECK-NEXT: [[U0:%.*]] = add nuw i32 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i32 [[U0]], 65535
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i32 [[U0]], 16
+; CHECK-NEXT: [[U1:%.*]] = add nuw i32 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i32 [[U1]], 16
+; CHECK-NEXT: [[U2:%.*]] = add nuw i32 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i32 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i32 [[HW64]]
+;
+ %x_lo = and i32 %x, u0xffff
+ %y_lo = and i32 %y, u0xffff
+ %x_hi = lshr i32 %x, 16
+ %y_hi = lshr i32 %y, 16
+
+ %t0 = mul nuw i32 %y_lo, %x_lo
+ %t1 = mul nuw i32 %y_lo, %x_hi
+ %t2 = mul nuw i32 %y_hi, %x_lo
+ %t3 = mul nuw i32 %y_hi, %x_hi
+
+ %t0_hi = lshr i32 %t0, 16
+
+ %u0 = add nuw i32 %t0_hi, %t1
+ %u0_lo = and i32 %u0, u0xffff
+ %u0_hi = lshr i32 %u0, 16
+ %u1 = add nuw i32 %u0_lo, %t2
+ %u1_hi = lshr i32 %u1, 16
+ %u2 = add nuw i32 %u0_hi, %t3
+ %hw64 = add nuw i32 %u2, %u1_hi
+ ret i32 %hw64
+}
+
+define <2 x i32> @umulh_variant_v2i32(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: define <2 x i32> @umulh_variant_v2i32(
+; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and <2 x i32> [[X]], splat (i32 65535)
+; CHECK-NEXT: [[Y_LO:%.*]] = and <2 x i32> [[Y]], splat (i32 65535)
+; CHECK-NEXT: [[X_HI:%.*]] = lshr <2 x i32> [[X]], splat (i32 16)
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr <2 x i32> [[Y]], splat (i32 16)
+; CHECK-NEXT: [[T0:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr <2 x i32> [[T0]], splat (i32 16)
+; CHECK-NEXT: [[U0:%.*]] = add nuw <2 x i32> [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and <2 x i32> [[U0]], splat (i32 65535)
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr <2 x i32> [[U0]], splat (i32 16)
+; CHECK-NEXT: [[U1:%.*]] = add nuw <2 x i32> [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr <2 x i32> [[U1]], splat (i32 16)
+; CHECK-NEXT: [[U2:%.*]] = add nuw <2 x i32> [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw <2 x i32> [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret <2 x i32> [[HW64]]
+;
+ %x_lo = and <2 x i32> %x, <i32 u0xffff, i32 u0xffff>
+ %y_lo = and <2 x i32> %y, <i32 u0xffff, i32 u0xffff>
+ %x_hi = lshr <2 x i32> %x, <i32 16, i32 16>
+ %y_hi = lshr <2 x i32> %y, <i32 16, i32 16>
+
+ %t0 = mul nuw <2 x i32> %y_lo, %x_lo
+ %t1 = mul nuw <2 x i32> %y_lo, %x_hi
+ %t2 = mul nuw <2 x i32> %y_hi, %x_lo
+ %t3 = mul nuw <2 x i32> %y_hi, %x_hi
+
+ %t0_hi = lshr <2 x i32> %t0, <i32 16, i32 16>
+
+ %u0 = add nuw <2 x i32> %t0_hi, %t1
+ %u0_lo = and <2 x i32> %u0, <i32 u0xffff, i32 u0xffff>
+ %u0_hi = lshr <2 x i32> %u0, <i32 16, i32 16>
+ %u1 = add nuw <2 x i32> %u0_lo, %t2
+ %u1_hi = lshr <2 x i32> %u1, <i32 16, i32 16>
+ %u2 = add nuw <2 x i32> %u0_hi, %t3
+ %hw64 = add nuw <2 x i32> %u2, %u1_hi
+ ret <2 x i32> %hw64
+}
+
+define i128 @umulh_variant_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: define i128 @umulh_variant_i128(
+; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i128 [[X]], 18446744073709551615
+; CHECK-NEXT: [[Y_LO:%.*]] = and i128 [[Y]], 18446744073709551615
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i128 [[X]], 64
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i128 [[Y]], 64
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i128 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i128 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i128 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i128 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i128 [[T0]], 64
+; CHECK-NEXT: [[U0:%.*]] = add nuw i128 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i128 [[U0]], 18446744073709551615
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i128 [[U0]], 64
+; CHECK-NEXT: [[U1:%.*]] = add nuw i128 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i128 [[U1]], 64
+; CHECK-NEXT: [[U2:%.*]] = add nuw i128 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i128 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i128 [[HW64]]
+;
+ %x_lo = and i128 %x, u0xffffffffffffffff
+ %y_lo = and i128 %y, u0xffffffffffffffff
+ %x_hi = lshr i128 %x, 64
+ %y_hi = lshr i128 %y, 64
+
+ %t0 = mul nuw i128 %y_lo, %x_lo
+ %t1 = mul nuw i128 %y_lo, %x_hi
+ %t2 = mul nuw i128 %y_hi, %x_lo
+ %t3 = mul nuw i128 %y_hi, %x_hi
+
+ %t0_hi = lshr i128 %t0, 64
+
+ %u0 = add nuw i128 %t0_hi, %t1
+ %u0_lo = and i128 %u0, u0xffffffffffffffff
+ %u0_hi = lshr i128 %u0, 64
+ %u1 = add nuw i128 %u0_lo, %t2
+ %u1_hi = lshr i128 %u1, 64
+ %u2 = add nuw i128 %u0_hi, %t3
+ %hw64 = add nuw i128 %u2, %u1_hi
+ ret i128 %hw64
+}
+
+define i64 @umulh_variant_commuted(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant_commuted(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[X_LO]], [[Y_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[X_LO]], [[Y_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[X_HI]], [[Y_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[X_HI]], [[Y_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T1]], [[T0_HI]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[T2]], [[U0_LO]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw nsw i64 [[U1_HI]], [[U0_HI]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[T3]], [[U2]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %x_lo, %y_lo
+ %t1 = mul nuw i64 %x_lo, %y_hi
+ %t2 = mul nuw i64 %x_hi, %y_lo
+ %t3 = mul nuw i64 %x_hi, %y_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t1, %t0_hi
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %t2, %u0_lo
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u1_hi, %u0_hi
+ %hw64 = add nuw i64 %t3, %u2
+ ret i64 %hw64
+}
+
+
+
+; Negative tests
+
+define i64 @umulh_variant_notlox(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant_notlox(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967294
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967294
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967294 ; wrong imm
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+define i64 @umulh_variant_nothiy(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant_nothiy(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 16
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 16 ; wrong imm
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+define i64 @umulh_variant_notlowacc(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant_notlowacc(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967294
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967294 ; wrong imm
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+define i64 @umulh_variant_notll(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant_notll(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t3 = mul nuw i64 %y_lo, %x_lo ; swapped lolo and hihi
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t0 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+
+
+; Use checks
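+; 'llvm.fake.use' keeps its operand alive without otherwise affecting it,
+; so these tests pin an extra use on one intermediate at a time to check
+; which values the match tolerates having additional users.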
+
+; 't0' can have more than one use.
+define i64 @umulh_variant__mul_use__t0(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant__mul_use__t0(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T0]])
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ call void (...) @llvm.fake.use(i64 %t0)
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+; 't1' can have more than one use.
+define i64 @umulh_variant__mul_use__t1(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant__mul_use__t1(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T1]])
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ call void (...) @llvm.fake.use(i64 %t1)
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+; 't2' can have more than one use.
+define i64 @umulh_variant__mul_use__t2(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant__mul_use__t2(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T2]])
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ call void (...) @llvm.fake.use(i64 %t2)
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+; 't3' must have single use.
+define i64 @umulh_variant__mul_use__t3(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant__mul_use__t3(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T3]])
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+ call void (...) @llvm.fake.use(i64 %t3)
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+; 't0_hi' must have single use.
+define i64 @umulh_variant__mul_use__t0_hi(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant__mul_use__t0_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T0_HI]])
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+ call void (...) @llvm.fake.use(i64 %t0_hi)
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+; 'u0' must have single use.
+define i64 @umulh_variant__mul_use__u0(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U0]])
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ call void (...) @llvm.fake.use(i64 %u0)
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+; 'u0_lo' must have single use.
+define i64 @umulh_variant__mul_use__u0_lo(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0_lo(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U0_LO]])
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ call void (...) @llvm.fake.use(i64 %u0_lo)
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+; 'u0_hi' must have single use.
+define i64 @umulh_variant__mul_use__u0_hi(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant__mul_use__u0_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U0_HI]])
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ call void (...) @llvm.fake.use(i64 %u0_hi)
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+; 'u1' must have single use.
+define i64 @umulh_variant__mul_use__u1(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant__mul_use__u1(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U1]])
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ call void (...) @llvm.fake.use(i64 %u1)
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+; 'u1_hi' must have single use.
+define i64 @umulh_variant__mul_use__u1_hi(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant__mul_use__u1_hi(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
+; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U1_HI]])
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ call void (...) @llvm.fake.use(i64 %u1_hi)
+ %u2 = add nuw i64 %u0_hi, %t3
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
+
+; 'u2' must have single use.
+define i64 @umulh_variant__mul_use__u2(i64 %x, i64 %y) {
+; CHECK-LABEL: define i64 @umulh_variant__mul_use__u2(
+; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
+; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
+; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
+; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
+; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
+; CHECK-NEXT: [[U0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
+; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
+; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
+; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
+; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_HI]], [[T1]]
+; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U1]], 4294967295
+; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
+; CHECK-NEXT: [[U3:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
+; CHECK-NEXT: [[U1_HI1:%.*]] = lshr i64 [[U3]], 32
+; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U1_HI]], [[T3]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[U2]])
+; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI1]]
+; CHECK-NEXT: ret i64 [[HW64]]
+;
+ %x_lo = and i64 %x, 4294967295
+ %y_lo = and i64 %y, 4294967295
+ %x_hi = lshr i64 %x, 32
+ %y_hi = lshr i64 %y, 32
+
+ %t0 = mul nuw i64 %y_lo, %x_lo
+ %t1 = mul nuw i64 %y_lo, %x_hi
+ %t2 = mul nuw i64 %y_hi, %x_lo
+ %t3 = mul nuw i64 %y_hi, %x_hi
+
+ %t0_hi = lshr i64 %t0, 32
+
+ %u0 = add nuw i64 %t0_hi, %t1
+ %u0_lo = and i64 %u0, 4294967295
+ %u0_hi = lshr i64 %u0, 32
+ %u1 = add nuw i64 %u0_lo, %t2
+ %u1_hi = lshr i64 %u1, 32
+ %u2 = add nuw i64 %u0_hi, %t3
+ call void (...) @llvm.fake.use(i64 %u2)
+ %hw64 = add nuw i64 %u2, %u1_hi
+ ret i64 %hw64
+}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll
new file mode 100644
index 0000000000000..5f84bc4e93b82
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll
@@ -0,0 +1,600 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=aggressive-instcombine,instcombine -S | FileCheck %s
+
+; Ladder4 variant. https://alive2.llvm.org/ce/z/tExFRs
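+; With B = 2^16 here, x*y = mulhh*B^2 + (mullh + mulhl)*B + mulll. The low
+; halves of the cross products are accumulated with the high half of mulll,
+; and the resulting carry joins the high halves of the cross products in
+; the final high word.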
+define i32 @mul_ladder4(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_ladder4(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[XL]], [[YL]]
+; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[XL]], [[YH]]
+; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[XH]], [[YL]]
+; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[XH]], [[YH]]
+; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
+; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[SHR8]], [[CONV10]]
+; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535
+; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[ADD]], [[CONV12]]
+; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
+; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16
+; CHECK-NEXT: [[ADD16:%.*]] = add nuw i32 [[MULHH]], [[SHR15]]
+; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16
+; CHECK-NEXT: [[ADD18:%.*]] = add nuw i32 [[ADD16]], [[SHR17]]
+; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[ADD18]], [[SHR14]]
+; CHECK-NEXT: ret i32 [[ADD19]]
+;
+entry:
+ %xl = and i32 %x, 65535
+ %xh = lshr i32 %x, 16
+ %yl = and i32 %y, 65535
+ %yh = lshr i32 %y, 16
+ %mulll = mul nuw i32 %xl, %yl
+ %mullh = mul nuw i32 %xl, %yh
+ %mulhl = mul nuw i32 %xh, %yl
+ %mulhh = mul nuw i32 %xh, %yh
+ %shr8 = lshr i32 %mulll, 16
+ %conv10 = and i32 %mullh, 65535
+ %add = add nuw nsw i32 %shr8, %conv10
+ %conv12 = and i32 %mulhl, 65535
+ %add13 = add nuw nsw i32 %add, %conv12
+ %shr14 = lshr i32 %add13, 16
+ %shr15 = lshr i32 %mullh, 16
+ %add16 = add nuw i32 %mulhh, %shr15
+ %shr17 = lshr i32 %mulhl, 16
+ %add18 = add nuw i32 %add16, %shr17
+ %add19 = add nuw i32 %add18, %shr14
+ ret i32 %add19
+}
+
+define <2 x i32> @mul_ladder4_v2i32(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: define <2 x i32> @mul_ladder4_v2i32(
+; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[XL:%.*]] = and <2 x i32> [[X]], splat (i32 65535)
+; CHECK-NEXT: [[XH:%.*]] = lshr <2 x i32> [[X]], splat (i32 16)
+; CHECK-NEXT: [[YL:%.*]] = and <2 x i32> [[Y]], splat (i32 65535)
+; CHECK-NEXT: [[YH:%.*]] = lshr <2 x i32> [[Y]], splat (i32 16)
+; CHECK-NEXT: [[MULLL:%.*]] = mul nuw <2 x i32> [[XL]], [[YL]]
+; CHECK-NEXT: [[MULLH:%.*]] = mul nuw <2 x i32> [[XL]], [[YH]]
+; CHECK-NEXT: [[MULHL:%.*]] = mul nuw <2 x i32> [[XH]], [[YL]]
+; CHECK-NEXT: [[MULHH:%.*]] = mul nuw <2 x i32> [[XH]], [[YH]]
+; CHECK-NEXT: [[SHR8:%.*]] = lshr <2 x i32> [[MULLL]], splat (i32 16)
+; CHECK-NEXT: [[CONV10:%.*]] = and <2 x i32> [[MULLH]], splat (i32 65535)
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw <2 x i32> [[SHR8]], [[CONV10]]
+; CHECK-NEXT: [[CONV12:%.*]] = and <2 x i32> [[MULHL]], splat (i32 65535)
+; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw <2 x i32> [[ADD]], [[CONV12]]
+; CHECK-NEXT: [[SHR14:%.*]] = lshr <2 x i32> [[ADD13]], splat (i32 16)
+; CHECK-NEXT: [[SHR15:%.*]] = lshr <2 x i32> [[MULLH]], splat (i32 16)
+; CHECK-NEXT: [[ADD16:%.*]] = add nuw <2 x i32> [[MULHH]], [[SHR15]]
+; CHECK-NEXT: [[SHR17:%.*]] = lshr <2 x i32> [[MULHL]], splat (i32 16)
+; CHECK-NEXT: [[ADD18:%.*]] = add nuw <2 x i32> [[ADD16]], [[SHR17]]
+; CHECK-NEXT: [[ADD19:%.*]] = add nuw <2 x i32> [[ADD18]], [[SHR14]]
+; CHECK-NEXT: ret <2 x i32> [[ADD19]]
+;
+entry:
+ %xl = and <2 x i32> %x, <i32 65535, i32 65535>
+ %xh = lshr <2 x i32> %x, <i32 16, i32 16>
+ %yl = and <2 x i32> %y, <i32 65535, i32 65535>
+ %yh = lshr <2 x i32> %y, <i32 16, i32 16>
+ %mulll = mul nuw <2 x i32> %xl, %yl
+ %mullh = mul nuw <2 x i32> %xl, %yh
+ %mulhl = mul nuw <2 x i32> %xh, %yl
+ %mulhh = mul nuw <2 x i32> %xh, %yh
+ %shr8 = lshr <2 x i32> %mulll, <i32 16, i32 16>
+ %conv10 = and <2 x i32> %mullh, <i32 65535, i32 65535>
+ %add = add nuw nsw <2 x i32> %shr8, %conv10
+ %conv12 = and <2 x i32> %mulhl, <i32 65535, i32 65535>
+ %add13 = add nuw nsw <2 x i32> %add, %conv12
+ %shr14 = lshr <2 x i32> %add13, <i32 16, i32 16>
+ %shr15 = lshr <2 x i32> %mullh, <i32 16, i32 16>
+ %add16 = add nuw <2 x i32> %mulhh, %shr15
+ %shr17 = lshr <2 x i32> %mulhl, <i32 16, i32 16>
+ %add18 = add nuw <2 x i32> %add16, %shr17
+ %add19 = add nuw <2 x i32> %add18, %shr14
+ ret <2 x i32> %add19
+}
+
+define i128 @mul_ladder4_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: define i128 @mul_ladder4_i128(
+; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[XL:%.*]] = and i128 [[X]], 18446744073709551615
+; CHECK-NEXT: [[XH:%.*]] = lshr i128 [[X]], 64
+; CHECK-NEXT: [[YL:%.*]] = and i128 [[Y]], 18446744073709551615
+; CHECK-NEXT: [[YH:%.*]] = lshr i128 [[Y]], 64
+; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i128 [[XL]], [[YL]]
+; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i128 [[XL]], [[YH]]
+; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i128 [[XH]], [[YL]]
+; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i128 [[XH]], [[YH]]
+; CHECK-NEXT: [[SHR8:%.*]] = lshr i128 [[MULLL]], 64
+; CHECK-NEXT: [[CONV10:%.*]] = and i128 [[MULLH]], 18446744073709551615
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i128 [[SHR8]], [[CONV10]]
+; CHECK-NEXT: [[CONV12:%.*]] = and i128 [[MULHL]], 18446744073709551615
+; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i128 [[ADD]], [[CONV12]]
+; CHECK-NEXT: [[SHR14:%.*]] = lshr i128 [[ADD13]], 64
+; CHECK-NEXT: [[SHR15:%.*]] = lshr i128 [[MULLH]], 64
+; CHECK-NEXT: [[ADD16:%.*]] = add nuw i128 [[MULHH]], [[SHR15]]
+; CHECK-NEXT: [[SHR17:%.*]] = lshr i128 [[MULHL]], 64
+; CHECK-NEXT: [[ADD18:%.*]] = add nuw i128 [[ADD16]], [[SHR17]]
+; CHECK-NEXT: [[ADD19:%.*]] = add nuw i128 [[ADD18]], [[SHR14]]
+; CHECK-NEXT: ret i128 [[ADD19]]
+;
+entry:
+ %xl = and i128 %x, u0xffffffffffffffff
+ %xh = lshr i128 %x, 64
+ %yl = and i128 %y, u0xffffffffffffffff
+ %yh = lshr i128 %y, 64
+ %mulll = mul nuw i128 %xl, %yl
+ %mullh = mul nuw i128 %xl, %yh
+ %mulhl = mul nuw i128 %xh, %yl
+ %mulhh = mul nuw i128 %xh, %yh
+ %shr8 = lshr i128 %mulll, 64
+ %conv10 = and i128 %mullh, u0xffffffffffffffff
+ %add = add nuw nsw i128 %shr8, %conv10
+ %conv12 = and i128 %mulhl, u0xffffffffffffffff
+ %add13 = add nuw nsw i128 %add, %conv12
+ %shr14 = lshr i128 %add13, 64
+ %shr15 = lshr i128 %mullh, 64
+ %add16 = add nuw i128 %mulhh, %shr15
+ %shr17 = lshr i128 %mulhl, 64
+ %add18 = add nuw i128 %add16, %shr17
+ %add19 = add nuw i128 %add18, %shr14
+ ret i128 %add19
+}
+
+define i32 @mul_ladder4_commuted(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_ladder4_commuted(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]]
+; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[YH]], [[XL]]
+; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]]
+; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]]
+; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
+; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV10]], [[SHR8]]
+; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535
+; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]]
+; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
+; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16
+; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16
+; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]]
+; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]]
+; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[MULHH]], [[ADD18]]
+; CHECK-NEXT: ret i32 [[ADD19]]
+;
+entry:
+ %xl = and i32 %x, 65535
+ %xh = lshr i32 %x, 16
+ %yl = and i32 %y, 65535
+ %yh = lshr i32 %y, 16
+ %mulll = mul nuw i32 %yl, %xl
+ %mullh = mul nuw i32 %yh, %xl
+ %mulhl = mul nuw i32 %yl, %xh
+ %mulhh = mul nuw i32 %yh, %xh
+ %shr8 = lshr i32 %mulll, 16
+ %conv10 = and i32 %mullh, 65535
+ %add = add nuw nsw i32 %conv10, %shr8
+ %conv12 = and i32 %mulhl, 65535
+ %add13 = add nuw nsw i32 %conv12, %add
+ %shr14 = lshr i32 %add13, 16
+ %shr15 = lshr i32 %mullh, 16
+ %shr17 = lshr i32 %mulhl, 16
+ %add16 = add nuw i32 %shr14, %shr17
+ %add18 = add nuw i32 %add16, %shr15
+ %add19 = add nuw i32 %mulhh, %add18
+ ret i32 %add19
+}
+
+define i32 @mul_ladder4_swap_hl_lh(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_ladder4_swap_hl_lh(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[XL]], [[YL]]
+; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[XL]], [[YH]]
+; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[XH]], [[YL]]
+; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[XH]], [[YH]]
+; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
+; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULHL]], 65535
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[SHR8]], [[CONV10]]
+; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULLH]], 65535
+; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[ADD]], [[CONV12]]
+; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
+; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL]], 16
+; CHECK-NEXT: [[ADD16:%.*]] = add nuw i32 [[MULHH]], [[SHR15]]
+; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULLH]], 16
+; CHECK-NEXT: [[ADD18:%.*]] = add nuw i32 [[ADD16]], [[SHR17]]
+; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[ADD18]], [[SHR14]]
+; CHECK-NEXT: ret i32 [[ADD19]]
+;
+entry:
+ %xl = and i32 %x, 65535
+ %xh = lshr i32 %x, 16
+ %yl = and i32 %y, 65535
+ %yh = lshr i32 %y, 16
+ %mulll = mul nuw i32 %xl, %yl
+ %mullh = mul nuw i32 %xl, %yh
+ %mulhl = mul nuw i32 %xh, %yl
+ %mulhh = mul nuw i32 %xh, %yh
+ %shr8 = lshr i32 %mulll, 16
+ %conv10 = and i32 %mulhl, 65535
+ %add = add nuw nsw i32 %shr8, %conv10
+ %conv12 = and i32 %mullh, 65535
+ %add13 = add nuw nsw i32 %add, %conv12
+ %shr14 = lshr i32 %add13, 16
+ %shr15 = lshr i32 %mulhl, 16
+ %add16 = add nuw i32 %mulhh, %shr15
+ %shr17 = lshr i32 %mullh, 16
+ %add18 = add nuw i32 %add16, %shr17
+ %add19 = add nuw i32 %add18, %shr14
+ ret i32 %add19
+}
+
+
+; Negative tests
+
+define i32 @mul_ladder4_notlhhl(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_ladder4_notlhhl(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[XL]], [[YL]]
+; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[XH]], [[YL]]
+; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[XH]], [[YH]]
+; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
+; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULHL]], 65535
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[SHR8]], [[CONV10]]
+; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535
+; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[ADD]], [[CONV12]]
+; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
+; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL]], 16
+; CHECK-NEXT: [[ADD16:%.*]] = add nuw i32 [[MULHH]], [[SHR15]]
+; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16
+; CHECK-NEXT: [[ADD18:%.*]] = add nuw i32 [[ADD16]], [[SHR17]]
+; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[ADD18]], [[SHR14]]
+; CHECK-NEXT: ret i32 [[ADD19]]
+;
+entry:
+ %xl = and i32 %x, 65535
+ %xh = lshr i32 %x, 16
+ %yl = and i32 %y, 65535
+ %yh = lshr i32 %y, 16
+ %mulll = mul nuw i32 %xl, %yl
+ %mullh = mul nuw i32 %xl, %yh
+ %mulhl = mul nuw i32 %xh, %yl
+ %mulhh = mul nuw i32 %xh, %yh
+ %shr8 = lshr i32 %mulll, 16
+  %conv10 = and i32 %mulhl, 65535 ; wrong operand: mulhl in place of mullh
+ %add = add nuw nsw i32 %shr8, %conv10
+ %conv12 = and i32 %mulhl, 65535
+ %add13 = add nuw nsw i32 %add, %conv12
+ %shr14 = lshr i32 %add13, 16
+ %shr15 = lshr i32 %mulhl, 16
+ %add16 = add nuw i32 %mulhh, %shr15
+ %shr17 = lshr i32 %mulhl, 16
+ %add18 = add nuw i32 %add16, %shr17
+ %add19 = add nuw i32 %add18, %shr14
+ ret i32 %add19
+}
+
+; Extra uses
+
+define i32 @mul_ladder4_use_add13(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_ladder4_use_add13(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]]
+; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[YH]], [[XL]]
+; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]]
+; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]]
+; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
+; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV10]], [[SHR8]]
+; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535
+; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]]
+; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
+; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16
+; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16
+; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]]
+; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]]
+; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ADD13]])
+; CHECK-NEXT: ret i32 [[ADD19]]
+;
+entry:
+ %xl = and i32 %x, 65535
+ %xh = lshr i32 %x, 16
+ %yl = and i32 %y, 65535
+ %yh = lshr i32 %y, 16
+ %mulll = mul nuw i32 %yl, %xl
+ %mullh = mul nuw i32 %yh, %xl
+ %mulhl = mul nuw i32 %yl, %xh
+ %mulhh = mul nuw i32 %yh, %xh
+ %shr8 = lshr i32 %mulll, 16
+ %conv10 = and i32 %mullh, 65535
+ %add = add nuw nsw i32 %conv10, %shr8
+ %conv12 = and i32 %mulhl, 65535
+ %add13 = add nuw nsw i32 %conv12, %add
+ %shr14 = lshr i32 %add13, 16
+ %shr15 = lshr i32 %mullh, 16
+ %shr17 = lshr i32 %mulhl, 16
+ %add16 = add i32 %shr14, %shr17
+ %add18 = add i32 %add16, %shr15
+ %add19 = add i32 %mulhh, %add18
+ call void (...) @llvm.fake.use(i32 %add13)
+ ret i32 %add19
+}
+
+define i32 @mul_ladder4_use_conv12(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_ladder4_use_conv12(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]]
+; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YH]], [[XL]]
+; CHECK-NEXT: [[MULHL1:%.*]] = mul nuw i32 [[YL]], [[XH]]
+; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]]
+; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
+; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV12]], [[SHR8]]
+; CHECK-NEXT: [[CONV13:%.*]] = and i32 [[MULHL1]], 65535
+; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV13]], [[ADD]]
+; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
+; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL]], 16
+; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL1]], 16
+; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]]
+; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]]
+; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[CONV13]])
+; CHECK-NEXT: ret i32 [[ADD19]]
+;
+entry:
+ %xl = and i32 %x, 65535
+ %xh = lshr i32 %x, 16
+ %yl = and i32 %y, 65535
+ %yh = lshr i32 %y, 16
+ %mulll = mul nuw i32 %yl, %xl
+ %mullh = mul nuw i32 %yh, %xl
+ %mulhl = mul nuw i32 %yl, %xh
+ %mulhh = mul nuw i32 %yh, %xh
+ %shr8 = lshr i32 %mulll, 16
+ %conv10 = and i32 %mullh, 65535
+ %add = add nuw nsw i32 %conv10, %shr8
+ %conv12 = and i32 %mulhl, 65535
+ %add13 = add nuw nsw i32 %conv12, %add
+ %shr14 = lshr i32 %add13, 16
+ %shr15 = lshr i32 %mullh, 16
+ %shr17 = lshr i32 %mulhl, 16
+ %add16 = add i32 %shr14, %shr17
+ %add18 = add i32 %add16, %shr15
+ %add19 = add i32 %mulhh, %add18
+ call void (...) @llvm.fake.use(i32 %conv12)
+ ret i32 %add19
+}
+
+define i32 @mul_ladder4_use_u0(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_ladder4_use_u0(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]]
+; CHECK-NEXT: [[MULHL1:%.*]] = mul nuw i32 [[YH]], [[XL]]
+; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]]
+; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]]
+; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
+; CHECK-NEXT: [[CONV13:%.*]] = and i32 [[MULHL1]], 65535
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV13]], [[SHR8]]
+; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535
+; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]]
+; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
+; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL1]], 16
+; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16
+; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]]
+; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]]
+; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ADD]])
+; CHECK-NEXT: ret i32 [[ADD19]]
+;
+entry:
+ %xl = and i32 %x, 65535
+ %xh = lshr i32 %x, 16
+ %yl = and i32 %y, 65535
+ %yh = lshr i32 %y, 16
+ %mulll = mul nuw i32 %yl, %xl
+ %mullh = mul nuw i32 %yh, %xl
+ %mulhl = mul nuw i32 %yl, %xh
+ %mulhh = mul nuw i32 %yh, %xh
+ %shr8 = lshr i32 %mulll, 16
+ %conv10 = and i32 %mullh, 65535
+ %add = add nuw nsw i32 %conv10, %shr8
+ %conv12 = and i32 %mulhl, 65535
+ %add13 = add nuw nsw i32 %conv12, %add
+ %shr14 = lshr i32 %add13, 16
+ %shr15 = lshr i32 %mullh, 16
+ %shr17 = lshr i32 %mulhl, 16
+ %add16 = add i32 %shr14, %shr17
+ %add18 = add i32 %add16, %shr15
+ %add19 = add i32 %mulhh, %add18
+ call void (...) @llvm.fake.use(i32 %add)
+ ret i32 %add19
+}
+
+define i32 @mul_ladder4_use_hl(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_ladder4_use_hl(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]]
+; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[YH]], [[XL]]
+; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]]
+; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]]
+; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
+; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV10]], [[SHR8]]
+; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535
+; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]]
+; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
+; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16
+; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16
+; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]]
+; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]]
+; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MULHL]])
+; CHECK-NEXT: ret i32 [[ADD19]]
+;
+entry:
+ %xl = and i32 %x, 65535
+ %xh = lshr i32 %x, 16
+ %yl = and i32 %y, 65535
+ %yh = lshr i32 %y, 16
+ %mulll = mul nuw i32 %yl, %xl
+ %mullh = mul nuw i32 %yh, %xl
+ %mulhl = mul nuw i32 %yl, %xh
+ %mulhh = mul nuw i32 %yh, %xh
+ %shr8 = lshr i32 %mulll, 16
+ %conv10 = and i32 %mullh, 65535
+ %add = add nuw nsw i32 %conv10, %shr8
+ %conv12 = and i32 %mulhl, 65535
+ %add13 = add nuw nsw i32 %conv12, %add
+ %shr14 = lshr i32 %add13, 16
+ %shr15 = lshr i32 %mullh, 16
+ %shr17 = lshr i32 %mulhl, 16
+ %add16 = add i32 %shr14, %shr17
+ %add18 = add i32 %add16, %shr15
+ %add19 = add i32 %mulhh, %add18
+ call void (...) @llvm.fake.use(i32 %mulhl)
+ ret i32 %add19
+}
+
+define i32 @mul_ladder4_use_lh(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_ladder4_use_lh(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]]
+; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[YH]], [[XL]]
+; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]]
+; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]]
+; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
+; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV10]], [[SHR8]]
+; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535
+; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]]
+; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
+; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16
+; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16
+; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]]
+; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]]
+; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MULLH]])
+; CHECK-NEXT: ret i32 [[ADD19]]
+;
+entry:
+ %xl = and i32 %x, 65535
+ %xh = lshr i32 %x, 16
+ %yl = and i32 %y, 65535
+ %yh = lshr i32 %y, 16
+ %mulll = mul nuw i32 %yl, %xl
+ %mullh = mul nuw i32 %yh, %xl
+ %mulhl = mul nuw i32 %yl, %xh
+ %mulhh = mul nuw i32 %yh, %xh
+ %shr8 = lshr i32 %mulll, 16
+ %conv10 = and i32 %mullh, 65535
+ %add = add nuw nsw i32 %conv10, %shr8
+ %conv12 = and i32 %mulhl, 65535
+ %add13 = add nuw nsw i32 %conv12, %add
+ %shr14 = lshr i32 %add13, 16
+ %shr15 = lshr i32 %mullh, 16
+ %shr17 = lshr i32 %mulhl, 16
+ %add16 = add i32 %shr14, %shr17
+ %add18 = add i32 %add16, %shr15
+ %add19 = add i32 %mulhh, %add18
+ call void (...) @llvm.fake.use(i32 %mullh)
+ ret i32 %add19
+}
+
+define i32 @mul_ladder4_use_conv10(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @mul_ladder4_use_conv10(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
+; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
+; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
+; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]]
+; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YH]], [[XL]]
+; CHECK-NEXT: [[MULHL1:%.*]] = mul nuw i32 [[YL]], [[XH]]
+; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]]
+; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
+; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV12]], [[SHR8]]
+; CHECK-NEXT: [[CONV13:%.*]] = and i32 [[MULHL1]], 65535
+; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV13]], [[ADD]]
+; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
+; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL]], 16
+; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL1]], 16
+; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]]
+; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]]
+; CHECK-NEXT: [[ADD19:%.*]] = add i32 [[MULHH]], [[ADD18]]
+; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[CONV12]])
+; CHECK-NEXT: ret i32 [[ADD19]]
+;
+entry:
+ %xl = and i32 %x, 65535
+ %xh = lshr i32 %x, 16
+ %yl = and i32 %y, 65535
+ %yh = lshr i32 %y, 16
+ %mulll = mul nuw i32 %yl, %xl
+ %mullh = mul nuw i32 %yh, %xl
+ %mulhl = mul nuw i32 %yl, %xh
+ %mulhh = mul nuw i32 %yh, %xh
+ %shr8 = lshr i32 %mulll, 16
+ %conv10 = and i32 %mullh, 65535
+ %add = add nuw nsw i32 %conv10, %shr8
+ %conv12 = and i32 %mulhl, 65535
+ %add13 = add nuw nsw i32 %conv12, %add
+ %shr14 = lshr i32 %add13, 16
+ %shr15 = lshr i32 %mullh, 16
+ %shr17 = lshr i32 %mulhl, 16
+ %add16 = add i32 %shr14, %shr17
+ %add18 = add i32 %add16, %shr15
+ %add19 = add i32 %mulhh, %add18
+ call void (...) @llvm.fake.use(i32 %conv10)
+ ret i32 %add19
+}
>From 380f3792ca4ab6abc31321cc6ebe6d36bad3c60f Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 17 Nov 2025 16:11:48 +0000
Subject: [PATCH 2/4] [AggressiveInstCombine] Match long high-half multiply
This patch adds recognition of the high half of a long multiplication
computed by parts, combining it into a single wider multiply.
Considering a multiply made up of high and low parts, we can split the
multiply into:
x * y == (xh*T + xl) * (yh*T + yl)
where xh == x>>32 and xl == x & 0xffffffff. T = 2^32.
This expands to
xh*yh*T*T + xh*yl*T + xl*yh*T + xl*yl
which can be drawn as
[ xh*yh ]
[ xh*yl ]
[ xl*yh ]
[ xl*yl ]
We are looking for the "high" half, which is xh*yh + xh*yl>>32 + xl*yh>>32 +
carrys. The carry makes this difficult and there are multiple ways of
representing it. The ones we attempt to support here are:
Carry: xh*yh + carry + lowsum
carry = lowsum < xh*yl ? 0x1000000 : 0
lowsum = xh*yl + xl*yh + (xl*yl>>32)
Ladder: xh*yh + c2>>32 + c3>>32
c2 = xh*yl + (xl*yl >> 32); c3 = c2&0xffffffff + xl*yh
Carry4: xh*yh + carry + crosssum>>32 + (xl*yl>>32 + crosssum&0xffffffff) >> 32
crosssum = xh*yl + xl*yh
carry = crosssum < xh*yl ? 0x100000000 : 0
Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32;
low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
They all start by matching xh*yh plus two or three other operands. The leaves
of the tree are xh*yh, xh*yl, xl*yh and xl*yl, as the sketch below shows.
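For illustration, the Carry variant corresponds to source along these lines
(a minimal sketch for exposition only, not code from this patch; the function
name and the 64-bit width are assumptions):

#include <cstdint>

// Returns the high 64 bits of the 128-bit product x*y, built from 32-bit
// halves. This whole expression tree is what the fold turns into a single
// widened multiply plus shift and truncate.
uint64_t umulh_by_parts(uint64_t x, uint64_t y) {
  uint64_t xl = x & 0xffffffff, xh = x >> 32;
  uint64_t yl = y & 0xffffffff, yh = y >> 32;
  // lowsum can wrap at most once, so one unsigned compare detects the carry.
  uint64_t lowsum = xh * yl + xl * yh + ((xl * yl) >> 32);
  uint64_t carry = lowsum < xh * yl ? 0x100000000ULL : 0;
  return xh * yh + carry + (lowsum >> 32);
}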
Based on #156879 by @c-rhodes
---
.../AggressiveInstCombine.cpp | 301 +++++++++++++
.../AggressiveInstCombine/umulh_carry.ll | 140 ++-----
.../AggressiveInstCombine/umulh_carry4.ll | 394 +++++-------------
.../AggressiveInstCombine/umulh_ladder.ll | 176 ++------
.../AggressiveInstCombine/umulh_ladder4.ll | 120 ++----
5 files changed, 499 insertions(+), 632 deletions(-)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index b575d76e897d2..fb71f57eaa502 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1466,6 +1466,306 @@ static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
return false;
}
+/// Match high part of long multiplication.
+///
+/// Considering a multiply made up of high and low parts, we can split the
+/// multiply into:
+/// x * y == (xh*T + xl) * (yh*T + yl)
+/// where xh == x>>32 and xl == x & 0xffffffff. T = 2^32.
+/// This expands to
+/// xh*yh*T*T + xh*yl*T + xl*yh*T + xl*yl
+/// which can be drawn as
+/// [ xh*yh ]
+/// [ xh*yl ]
+/// [ xl*yh ]
+/// [ xl*yl ]
+/// We are looking for the "high" half, which is xh*yh + xh*yl>>32 + xl*yh>>32 +
+/// some carries. The carries make this difficult and there are multiple ways
+/// of representing them. The ones we attempt to support here are:
+/// Carry: xh*yh + carry + lowsum>>32
+/// carry = lowsum < xh*yl ? 0x100000000 : 0
+/// lowsum = xh*yl + xl*yh + (xl*yl>>32)
+/// Ladder: xh*yh + c2>>32 + c3>>32
+/// c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
+/// Carry4: xh*yh + carry + crosssum>>32 + (xl*yl>>32 + crosssum&0xffffffff) >> 32
+/// crosssum = xh*yl + xl*yh
+/// carry = crosssum < xh*yl ? 0x100000000 : 0
+/// Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32;
+/// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
+///
+/// They all start by matching xh*yh plus two or three other operands. The
+/// leaves of the tree are xh*yh, xh*yl, xl*yh and xl*yl.
+static bool foldMulHigh(Instruction &I) {
+ Type *Ty = I.getType();
+ if (!Ty->isIntOrIntVectorTy())
+ return false;
+
+ unsigned BW = Ty->getScalarSizeInBits();
+ APInt LowMask = APInt::getLowBitsSet(BW, BW / 2);
+ if (BW % 2 != 0)
+ return false;
+
+ auto CreateMulHigh = [&](Value *X, Value *Y) {
+ IRBuilder<> Builder(&I);
+ Type *NTy = Ty->getWithNewBitWidth(BW * 2);
+ Value *XExt = Builder.CreateZExt(X, NTy);
+ Value *YExt = Builder.CreateZExt(Y, NTy);
+ Value *Mul = Builder.CreateMul(XExt, YExt);
+ Value *High = Builder.CreateLShr(Mul, BW);
+ Value *Res = Builder.CreateTrunc(High, Ty);
+ I.replaceAllUsesWith(Res);
+ LLVM_DEBUG(dbgs() << "Created long multiply from parts of " << *X << " and "
+ << *Y << "\n");
+ return true;
+ };
+
+ // Common check routines for X_lo*Y_lo and X_hi*Y_lo
+ auto CheckLoLo = [&](Value *XlYl, Value *X, Value *Y) {
+ return match(XlYl, m_c_Mul(m_And(m_Specific(X), m_SpecificInt(LowMask)),
+ m_And(m_Specific(Y), m_SpecificInt(LowMask))));
+ };
+ auto CheckHiLo = [&](Value *XhYl, Value *X, Value *Y) {
+ return match(XhYl, m_c_Mul(m_LShr(m_Specific(X), m_SpecificInt(BW / 2)),
+ m_And(m_Specific(Y), m_SpecificInt(LowMask))));
+ };
+
+ auto foldMulHighCarry = [&](Value *X, Value *Y, Instruction *Carry,
+ Instruction *B) {
+ // Looking for LowSum >> 32 and carry (select)
+ if (Carry->getOpcode() != Instruction::Select)
+ std::swap(Carry, B);
+
+ // Carry = LowSum < XhYl ? 0x100000000 : 0
+ CmpPredicate Pred;
+ Value *LowSum, *XhYl;
+ if (!match(Carry,
+ m_OneUse(m_Select(
+ m_OneUse(m_ICmp(Pred, m_Value(LowSum), m_Value(XhYl))),
+ m_SpecificInt(APInt(BW, 1) << BW / 2), m_SpecificInt(0)))) ||
+ Pred != ICmpInst::ICMP_ULT)
+ return false;
+
+ // XhYl can be Xh*Yl or Xl*Yh
+ if (!CheckHiLo(XhYl, X, Y)) {
+ if (CheckHiLo(XhYl, Y, X))
+ std::swap(X, Y);
+ else
+ return false;
+ }
+ if (XhYl->hasNUsesOrMore(3))
+ return false;
+
+ // B = LowSum >> 32
+ if (!match(B,
+ m_OneUse(m_LShr(m_Specific(LowSum), m_SpecificInt(BW / 2)))) ||
+ LowSum->hasNUsesOrMore(3))
+ return false;
+
+ // LowSum = XhYl + XlYh + XlYl>>32
+ Value *XlYh, *XlYl;
+ auto XlYlHi = m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2));
+ if (!match(LowSum,
+ m_c_Add(m_Specific(XhYl),
+ m_OneUse(m_c_Add(m_OneUse(m_Value(XlYh)), XlYlHi)))) &&
+ !match(LowSum, m_c_Add(m_OneUse(m_Value(XlYh)),
+ m_OneUse(m_c_Add(m_Specific(XhYl), XlYlHi)))) &&
+ !match(LowSum,
+ m_c_Add(XlYlHi, m_OneUse(m_c_Add(m_Specific(XhYl),
+ m_OneUse(m_Value(XlYh)))))))
+ return false;
+
+ // Check XlYl and XlYh
+ if (!CheckLoLo(XlYl, X, Y))
+ return false;
+ if (!CheckHiLo(XlYh, Y, X))
+ return false;
+
+ return CreateMulHigh(X, Y);
+ };
+
+ auto foldMulHighLadder = [&](Value *X, Value *Y, Instruction *A,
+ Instruction *B) {
+ // xh*yh + c2>>32 + c3>>32
+ // c2 = xh*yl + (xl*yl >> 32); c3 = c2&0xffffffff + xl*yh
+ Value *XlYh, *XhYl, *C2, *C3;
+ // Strip off the two expected shifts.
+ if (!match(A, m_LShr(m_Value(C2), m_SpecificInt(BW / 2))) ||
+ !match(B, m_LShr(m_Value(C3), m_SpecificInt(BW / 2))))
+ return false;
+
+ // Match c3 = c2&0xffffffff + xl*yh
+ if (!match(C3, m_c_Add(m_And(m_Specific(C2), m_SpecificInt(LowMask)),
+ m_Value(XhYl))))
+ std::swap(C2, C3);
+ if (!match(C3,
+ m_c_Add(m_OneUse(m_And(m_Specific(C2), m_SpecificInt(LowMask))),
+ m_Value(XhYl))) ||
+ !C3->hasOneUse() || C2->hasNUsesOrMore(3))
+ return false;
+
+ // Match c2 = xh*yl + (xl*yl >> 32)
+ Value *XlYl;
+ if (!match(C2, m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)),
+ m_Value(XlYh))))
+ return false;
+
+ // Match XhYl and XlYh - they can appear either way around.
+ if (!CheckHiLo(XlYh, Y, X))
+ std::swap(XlYh, XhYl);
+ if (!CheckHiLo(XlYh, Y, X))
+ return false;
+ if (!CheckHiLo(XhYl, X, Y))
+ return false;
+ if (!CheckLoLo(XlYl, X, Y))
+ return false;
+
+ return CreateMulHigh(X, Y);
+ };
+
+ auto foldMulHighLadder4 = [&](Value *X, Value *Y, Instruction *A,
+ Instruction *B, Instruction *C) {
+ /// Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32;
+ /// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
+
+ // Find A = Low >> 32 and B/C = XhYl>>32, XlYh>>32.
+ auto ShiftAdd = m_LShr(m_Add(m_Value(), m_Value()), m_SpecificInt(BW / 2));
+ if (!match(A, ShiftAdd))
+ std::swap(A, B);
+ if (!match(A, ShiftAdd))
+ std::swap(A, C);
+ Value *Low;
+ if (!match(A, m_LShr(m_OneUse(m_Value(Low)), m_SpecificInt(BW / 2))))
+ return false;
+
+ // Match B == XhYl>>32 and C == XlYh>>32
+ Value *XhYl, *XlYh;
+ if (!match(B, m_LShr(m_Value(XhYl), m_SpecificInt(BW / 2))) ||
+ !match(C, m_LShr(m_Value(XlYh), m_SpecificInt(BW / 2))))
+ return false;
+ if (!CheckHiLo(XhYl, X, Y))
+ std::swap(XhYl, XlYh);
+ if (!CheckHiLo(XhYl, X, Y) || XhYl->hasNUsesOrMore(3))
+ return false;
+ if (!CheckHiLo(XlYh, Y, X) || XlYh->hasNUsesOrMore(3))
+ return false;
+
+ // Match Low as XlYl>>32 + XhYl&0xffffffff + XlYh&0xffffffff
+ Value *XlYl;
+ if (!match(
+ Low,
+ m_c_Add(
+ m_OneUse(m_c_Add(
+ m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))),
+ m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))),
+ m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))))) &&
+ !match(
+ Low,
+ m_c_Add(
+ m_OneUse(m_c_Add(
+ m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))),
+ m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))))),
+ m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))) &&
+ !match(
+ Low,
+ m_c_Add(
+ m_OneUse(m_c_Add(
+ m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))),
+ m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))))),
+ m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))))))
+ return false;
+ if (!CheckLoLo(XlYl, X, Y))
+ return false;
+
+ return CreateMulHigh(X, Y);
+ };
+
+ auto foldMulHighCarry4 = [&](Value *X, Value *Y, Instruction *Carry,
+ Instruction *B, Instruction *C) {
+ // xh*yh + carry + crosssum>>32 + (xl*yl>>32 + crosssum&0xffffffff) >> 32
+ // crosssum = xh*yl + xl*yh
+ // carry = crosssum < xh*yl ? 0x100000000 : 0
+ if (Carry->getOpcode() != Instruction::Select)
+ std::swap(Carry, B);
+ if (Carry->getOpcode() != Instruction::Select)
+ std::swap(Carry, C);
+
+ // Carry = CrossSum < XhYl ? 0x100000000 : 0
+ CmpPredicate Pred;
+ Value *CrossSum, *XhYl;
+ if (!match(Carry,
+ m_OneUse(m_Select(
+ m_OneUse(m_ICmp(Pred, m_Value(CrossSum), m_Value(XhYl))),
+ m_SpecificInt(APInt(BW, 1) << BW / 2), m_SpecificInt(0)))) ||
+ Pred != ICmpInst::ICMP_ULT)
+ return false;
+
+ if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BW / 2))))
+ std::swap(B, C);
+ if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BW / 2))))
+ return false;
+
+ Value *XlYl, *LowAccum;
+ if (!match(C, m_LShr(m_Value(LowAccum), m_SpecificInt(BW / 2))) ||
+ !match(LowAccum,
+ m_c_Add(m_OneUse(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))),
+ m_OneUse(m_And(m_Specific(CrossSum),
+ m_SpecificInt(LowMask))))) ||
+ LowAccum->hasNUsesOrMore(3))
+ return false;
+ if (!CheckLoLo(XlYl, X, Y))
+ return false;
+
+ if (!CheckHiLo(XhYl, X, Y))
+ std::swap(X, Y);
+ if (!CheckHiLo(XhYl, X, Y))
+ return false;
+ if (!match(CrossSum,
+ m_c_Add(m_Specific(XhYl),
+ m_OneUse(m_c_Mul(
+ m_LShr(m_Specific(Y), m_SpecificInt(BW / 2)),
+ m_And(m_Specific(X), m_SpecificInt(LowMask)))))) ||
+ CrossSum->hasNUsesOrMore(4) || XhYl->hasNUsesOrMore(3))
+ return false;
+
+ return CreateMulHigh(X, Y);
+ };
+
+ // X and Y are the two inputs, A, B and C are other parts of the pattern
+ // (crosssum>>32, carry, etc).
+ Value *X, *Y;
+ Instruction *A, *B, *C;
+ auto HiHi = m_OneUse(m_Mul(m_LShr(m_Value(X), m_SpecificInt(BW / 2)),
+ m_LShr(m_Value(Y), m_SpecificInt(BW / 2))));
+ if ((match(&I, m_c_Add(HiHi, m_OneUse(m_Add(m_Instruction(A),
+ m_Instruction(B))))) ||
+ match(&I, m_c_Add(m_Instruction(A),
+ m_OneUse(m_c_Add(HiHi, m_Instruction(B)))))) &&
+ A->hasOneUse() && B->hasOneUse())
+ if (foldMulHighCarry(X, Y, A, B) || foldMulHighLadder(X, Y, A, B))
+ return true;
+
+ if ((match(&I, m_c_Add(HiHi, m_OneUse(m_c_Add(
+ m_Instruction(A),
+ m_OneUse(m_Add(m_Instruction(B),
+ m_Instruction(C))))))) ||
+ match(&I, m_c_Add(m_Instruction(A),
+ m_OneUse(m_c_Add(
+ HiHi, m_OneUse(m_Add(m_Instruction(B),
+ m_Instruction(C))))))) ||
+ match(&I, m_c_Add(m_Instruction(A),
+ m_OneUse(m_c_Add(
+ m_Instruction(B),
+ m_OneUse(m_c_Add(HiHi, m_Instruction(C))))))) ||
+ match(&I,
+ m_c_Add(m_OneUse(m_c_Add(HiHi, m_Instruction(A))),
+ m_OneUse(m_Add(m_Instruction(B), m_Instruction(C)))))) &&
+ A->hasOneUse() && B->hasOneUse() && C->hasOneUse())
+ return foldMulHighCarry4(X, Y, A, B, C) ||
+ foldMulHighLadder4(X, Y, A, B, C);
+
+ return false;
+}
+
/// This is the entry point for folds that could be implemented in regular
/// InstCombine, but they are separated because they are not expected to
/// occur frequently and/or have more than a constant-length pattern match.
@@ -1495,6 +1795,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
MadeChange |= foldPatternedLoads(I, DL);
MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
+ MadeChange |= foldMulHigh(I);
// NOTE: This function introduces erasing of the instruction `I`, so it
// needs to be called at the end of this sequence, otherwise we may make
// bugs.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll
index b9801370028cc..b78095cac0df9 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry.ll
@@ -6,22 +6,11 @@ define i32 @mul_carry(i32 %x, i32 %y) {
; CHECK-LABEL: define i32 @mul_carry(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
-; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
-; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
-; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
-; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
-; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
-; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
-; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
-; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
-; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
-; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
-; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[ADD11]]
;
entry:
@@ -49,22 +38,11 @@ define i128 @mul_carry_i128(i128 %x, i128 %y) {
; CHECK-LABEL: define i128 @mul_carry_i128(
; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[SHR:%.*]] = lshr i128 [[X]], 64
-; CHECK-NEXT: [[AND:%.*]] = and i128 [[X]], 18446744073709551615
-; CHECK-NEXT: [[SHR1:%.*]] = lshr i128 [[Y]], 64
-; CHECK-NEXT: [[AND2:%.*]] = and i128 [[Y]], 18446744073709551615
-; CHECK-NEXT: [[MUL:%.*]] = mul nuw i128 [[SHR]], [[AND2]]
-; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i128 [[AND]], [[SHR1]]
-; CHECK-NEXT: [[ADD:%.*]] = add i128 [[MUL]], [[MUL3]]
-; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i128 [[AND]], [[AND2]]
-; CHECK-NEXT: [[SHR5:%.*]] = lshr i128 [[MUL4]], 64
-; CHECK-NEXT: [[ADD6:%.*]] = add i128 [[ADD]], [[SHR5]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i128 [[ADD6]], [[MUL]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i128 18446744073709551616, i128 0
-; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i128 [[SHR]], [[SHR1]]
-; CHECK-NEXT: [[ADD9:%.*]] = add nuw i128 [[MUL8]], [[COND]]
-; CHECK-NEXT: [[SHR10:%.*]] = lshr i128 [[ADD6]], 64
-; CHECK-NEXT: [[ADD11:%.*]] = add i128 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i128 [[X]] to i256
+; CHECK-NEXT: [[TMP1:%.*]] = zext i128 [[Y]] to i256
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i256 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i256 [[TMP2]], 128
+; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i256 [[TMP3]] to i128
; CHECK-NEXT: ret i128 [[ADD11]]
;
entry:
@@ -92,22 +70,11 @@ define <4 x i32> @mul_carry_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: define <4 x i32> @mul_carry_v4i32(
; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[SHR:%.*]] = lshr <4 x i32> [[X]], splat (i32 16)
-; CHECK-NEXT: [[AND:%.*]] = and <4 x i32> [[X]], splat (i32 65535)
-; CHECK-NEXT: [[SHR1:%.*]] = lshr <4 x i32> [[Y]], splat (i32 16)
-; CHECK-NEXT: [[AND2:%.*]] = and <4 x i32> [[Y]], splat (i32 65535)
-; CHECK-NEXT: [[MUL:%.*]] = mul nuw <4 x i32> [[SHR]], [[AND2]]
-; CHECK-NEXT: [[MUL3:%.*]] = mul nuw <4 x i32> [[AND]], [[SHR1]]
-; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[MUL]], [[MUL3]]
-; CHECK-NEXT: [[MUL4:%.*]] = mul nuw <4 x i32> [[AND]], [[AND2]]
-; CHECK-NEXT: [[SHR5:%.*]] = lshr <4 x i32> [[MUL4]], splat (i32 16)
-; CHECK-NEXT: [[ADD6:%.*]] = add <4 x i32> [[ADD]], [[SHR5]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult <4 x i32> [[ADD6]], [[MUL]]
-; CHECK-NEXT: [[COND:%.*]] = select <4 x i1> [[CMP]], <4 x i32> splat (i32 65536), <4 x i32> zeroinitializer
-; CHECK-NEXT: [[MUL8:%.*]] = mul nuw <4 x i32> [[SHR]], [[SHR1]]
-; CHECK-NEXT: [[ADD9:%.*]] = add nuw <4 x i32> [[MUL8]], [[COND]]
-; CHECK-NEXT: [[SHR10:%.*]] = lshr <4 x i32> [[ADD6]], splat (i32 16)
-; CHECK-NEXT: [[ADD11:%.*]] = add <4 x i32> [[ADD9]], [[SHR10]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i32> [[X]] to <4 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> [[Y]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw <4 x i64> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], splat (i64 32)
+; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw <4 x i64> [[TMP3]] to <4 x i32>
; CHECK-NEXT: ret <4 x i32> [[ADD11]]
;
entry:
@@ -135,22 +102,11 @@ define i32 @mul_carry_xlyh(i32 %x, i32 %y) {
; CHECK-LABEL: define i32 @mul_carry_xlyh(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
-; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
-; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
-; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
-; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
-; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
-; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
-; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL3]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
-; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
-; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
-; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
-; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[Y]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[ADD11]]
;
entry:
@@ -177,22 +133,11 @@ define i32 @mul_carry_comm(i32 %x, i32 %y) {
; CHECK-LABEL: define i32 @mul_carry_comm(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
-; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
-; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
-; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[AND2]], [[SHR]]
-; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[SHR1]], [[AND]]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL3]], [[MUL]]
-; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
-; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
-; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[SHR5]], [[ADD]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
-; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
-; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
-; CHECK-NEXT: [[ADD9:%.*]] = or disjoint i32 [[COND]], [[SHR10]]
-; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[MUL8]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[ADD11]]
;
entry:
@@ -520,22 +465,15 @@ define i32 @mul_carry_use_llh(i32 %x, i32 %y) {
; CHECK-LABEL: define i32 @mul_carry_use_llh(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
-; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
-; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
-; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
; CHECK-NEXT: [[ADD6:%.*]] = mul nuw i32 [[AND]], [[AND2]]
; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
-; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD]], [[SHR10]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD7]], [[MUL]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
-; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
-; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
-; CHECK-NEXT: [[SHR11:%.*]] = lshr i32 [[ADD7]], 16
-; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR11]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32
; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[SHR10]])
; CHECK-NEXT: ret i32 [[ADD11]]
;
@@ -564,22 +502,14 @@ define i32 @mul_carry_use_mulll(i32 %x, i32 %y) {
; CHECK-LABEL: define i32 @mul_carry_use_mulll(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 65535
-; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[Y]], 16
; CHECK-NEXT: [[AND2:%.*]] = and i32 [[Y]], 65535
-; CHECK-NEXT: [[MUL:%.*]] = mul nuw i32 [[SHR]], [[AND2]]
-; CHECK-NEXT: [[MUL3:%.*]] = mul nuw i32 [[AND]], [[SHR1]]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[MUL3]]
; CHECK-NEXT: [[MUL4:%.*]] = mul nuw i32 [[AND]], [[AND2]]
-; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[MUL4]], 16
-; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD]], [[SHR5]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD6]], [[MUL]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 65536, i32 0
-; CHECK-NEXT: [[MUL8:%.*]] = mul nuw i32 [[SHR]], [[SHR1]]
-; CHECK-NEXT: [[ADD9:%.*]] = add nuw i32 [[MUL8]], [[COND]]
-; CHECK-NEXT: [[SHR10:%.*]] = lshr i32 [[ADD6]], 16
-; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD9]], [[SHR10]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT: [[ADD11:%.*]] = trunc nuw i64 [[TMP3]] to i32
; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[MUL4]])
; CHECK-NEXT: ret i32 [[ADD11]]
;
diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll
index d92434a7a7ea5..fa21721f17762 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_carry4.ll
@@ -5,25 +5,11 @@
define i64 @umulh(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
-; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64
; CHECK-NEXT: ret i64 [[TMP4]]
;
; Extract low and high 32 bits
@@ -70,25 +56,11 @@ define i64 @umulh(i64 %x, i64 %y) {
define i64 @umulh__commuted(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__commuted(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
-; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[X_HI]], [[Y_LO]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[X_LO]], [[Y_HI]]
-; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[X_LO]], [[Y_LO]]
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_LO_X_HI]], [[Y_HI_X_LO]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[Y_LO_X_LO_HI]], [[CROSS_SUM_LO]]
-; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CROSS_SUM_HI]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[CARRY]], [[INTERMEDIATE]]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[LOW_ACCUM_HI]], [[INTERMEDIATE_PLUS_CARRY]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64
; CHECK-NEXT: ret i64 [[TMP4]]
;
; Extract low and high 32 bits
@@ -132,25 +104,11 @@ define i32 @mulh_src32(i32 %x, i32 %y) {
; Extract low and high 16 bits
; CHECK-LABEL: define i32 @mulh_src32(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i32 [[X]], 65535
-; CHECK-NEXT: [[Y_LO:%.*]] = and i32 [[Y]], 65535
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i32 [[X]], 16
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i32 [[Y]], 16
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i32 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i32 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i32 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i32 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i32 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i32 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i32 65536, i32 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i32 [[Y_LO_X_LO]], 16
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i32 [[CROSS_SUM]], 65535
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i32 [[CROSS_SUM]], 16
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i32 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i32 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i32 [[LOW_ACCUM]], 16
-; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i32 [[INTERMEDIATE]], [[CARRY]]
-; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[Y]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP3]], 32
+; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i64 [[TMP4]] to i32
; CHECK-NEXT: ret i32 [[TMP5]]
;
%x_lo = and i32 %x, u0xffff ; x & 0xffff
@@ -193,25 +151,11 @@ define i128 @mulh_src128(i128 %x, i128 %y) {
; Extract low and high 64 bits
; CHECK-LABEL: define i128 @mulh_src128(
; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i128 [[X]], 18446744073709551615
-; CHECK-NEXT: [[Y_LO:%.*]] = and i128 [[Y]], 18446744073709551615
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i128 [[X]], 64
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i128 [[Y]], 64
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i128 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i128 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i128 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i128 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i128 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i128 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i128 18446744073709551616, i128 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i128 [[Y_LO_X_LO]], 64
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i128 [[CROSS_SUM]], 18446744073709551615
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i128 [[CROSS_SUM]], 64
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i128 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i128 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i128 [[LOW_ACCUM]], 64
-; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i128 [[INTERMEDIATE]], [[CARRY]]
-; CHECK-NEXT: [[HW64:%.*]] = add i128 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i128 [[X]] to i256
+; CHECK-NEXT: [[TMP2:%.*]] = zext i128 [[Y]] to i256
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i256 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i256 [[TMP3]], 128
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i256 [[TMP4]] to i128
; CHECK-NEXT: ret i128 [[HW64]]
;
%x_lo = and i128 %x, u0xffffffffffffffff ; x & 0xffffffffffffffff
@@ -254,25 +198,11 @@ define <2 x i32> @mulh_v2i32(<2 x i32> %x, <2 x i32> %y) {
; Extract low and high 16 bits
; CHECK-LABEL: define <2 x i32> @mulh_v2i32(
; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and <2 x i32> [[X]], splat (i32 65535)
-; CHECK-NEXT: [[Y_LO:%.*]] = and <2 x i32> [[Y]], splat (i32 65535)
-; CHECK-NEXT: [[X_HI:%.*]] = lshr <2 x i32> [[X]], splat (i32 16)
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr <2 x i32> [[Y]], splat (i32 16)
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add <2 x i32> [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult <2 x i32> [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select <2 x i1> [[CARRY_OUT]], <2 x i32> splat (i32 65536), <2 x i32> zeroinitializer
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr <2 x i32> [[Y_LO_X_LO]], splat (i32 16)
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and <2 x i32> [[CROSS_SUM]], splat (i32 65535)
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr <2 x i32> [[CROSS_SUM]], splat (i32 16)
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw <2 x i32> [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw <2 x i32> [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr <2 x i32> [[LOW_ACCUM]], splat (i32 16)
-; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add <2 x i32> [[INTERMEDIATE]], [[CARRY]]
-; CHECK-NEXT: [[HW64:%.*]] = add <2 x i32> [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[X]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[Y]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 32)
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw <2 x i64> [[TMP4]] to <2 x i32>
; CHECK-NEXT: ret <2 x i32> [[HW64]]
;
%x_lo = and <2 x i32> %x, <i32 u0xffff, i32 u0xffff>
@@ -315,30 +245,14 @@ define <2 x i32> @mulh_v2i32(<2 x i32> %x, <2 x i32> %y) {
define void @full_mul_int128(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
-; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[TMP4:%.*]] = trunc nuw i128 [[TMP5]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[TMP4]], ptr [[HI_PTR]], align 8
-; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
-; CHECK-NEXT: [[TMP8:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[X]], [[Y]]
; CHECK-NEXT: store i64 [[TMP8]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
@@ -831,24 +745,11 @@ define i64 @umulh__mul_use__x_lo(i64 %x, i64 %y) {
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_LO]])
-; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
-; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[HW64]]
;
; Extract low and high 32 bits
@@ -893,26 +794,13 @@ define i64 @umulh__mul_use__x_lo(i64 %x, i64 %y) {
define i64 @umulh__mul_use__y_hi(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh__mul_use__y_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
-; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI]])
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
-; CHECK-NEXT: [[HW64:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[HW64]]
;
; Extract low and high 32 bits
@@ -1154,24 +1042,13 @@ define i64 @umulh__mul_use__y_lo_x_lo(i64 %x, i64 %y) {
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO]])
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[TMP5]]
;
; Extract low and high 32 bits
@@ -1607,22 +1484,19 @@ define i64 @umulh__mul_use__low_accum(i64 %x, i64 %y) {
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul i64 [[Y]], [[X_HI]]
+; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul i64 [[Y_HI]], [[X]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM]])
-; CHECK-NEXT: [[INTERMEDIATE:%.*]] = add nuw i64 [[CROSS_SUM_HI]], [[Y_HI_X_HI]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[INTERMEDIATE_PLUS_CARRY:%.*]] = add i64 [[INTERMEDIATE]], [[CARRY]]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INTERMEDIATE_PLUS_CARRY]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[TMP5]]
;
; Extract low and high 32 bits
@@ -1862,29 +1736,14 @@ define void @full_mul_int128__mul_use__x_lo(i64 %x, i64 %y, ptr %p) {
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_LO]])
-; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
-; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
-; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
-; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
@@ -1932,31 +1791,16 @@ define void @full_mul_int128__mul_use__x_lo(i64 %x, i64 %y, ptr %p) {
define void @full_mul_int128__mul_use__y_lo(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__y_lo(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO]])
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
-; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
-; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
-; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
@@ -2004,31 +1848,16 @@ define void @full_mul_int128__mul_use__y_lo(i64 %x, i64 %y, ptr %p) {
define void @full_mul_int128__mul_use__x_hi(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__x_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
-; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[X_HI]])
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
-; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
-; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
-; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
@@ -2076,31 +1905,16 @@ define void @full_mul_int128__mul_use__x_hi(i64 %x, i64 %y, ptr %p) {
define void @full_mul_int128__mul_use__y_hi(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__y_hi(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
-; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_HI]])
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
-; CHECK-NEXT: [[HW64:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[HW64]], ptr [[HI_PTR]], align 8
-; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
-; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
@@ -2369,27 +2183,20 @@ define void @full_mul_int128__mul_use__y_lo_x_lo(i64 %x, i64 %y, ptr %p) {
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
+; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = mul i64 [[Y]], [[X_HI]]
+; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = mul i64 [[Y_HI]], [[X]]
; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[Y_LO_X_LO]])
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[TMP6]], [[LOW_ACCUM_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[TMP6]], 4294967295
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[TMP6]], 32
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
-; CHECK-NEXT: [[LOW_ACCUM_HI1:%.*]] = lshr i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[UPPER_MID_WITH_CROSS1:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UPPER_MID_WITH_CROSS1]], [[LOW_ACCUM_HI1]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8
-; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
-; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
+; CHECK-NEXT: [[LOW_ACCUM1:%.*]] = shl i64 [[TMP6]], 32
+; CHECK-NEXT: [[LW64:%.*]] = add i64 [[Y_LO_X_LO]], [[LOW_ACCUM1]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
@@ -3157,31 +2964,16 @@ define void @full_mul_int128__mul_use__upper_mid_with_cross(i64 %x, i64 %y, ptr
define void @full_mul_int128__mul_use__low_accum_shifted(i64 %x, i64 %y, ptr %p) {
; CHECK-LABEL: define void @full_mul_int128__mul_use__low_accum_shifted(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]], ptr [[P:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
-; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[Y_LO_X_HI:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_HI:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[Y_HI_X_LO:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[Y_LO_X_LO:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[CROSS_SUM:%.*]] = add i64 [[Y_HI_X_LO]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY_OUT:%.*]] = icmp ult i64 [[CROSS_SUM]], [[Y_LO_X_HI]]
-; CHECK-NEXT: [[CARRY:%.*]] = select i1 [[CARRY_OUT]], i64 4294967296, i64 0
-; CHECK-NEXT: [[Y_LO_X_LO_HI:%.*]] = lshr i64 [[Y_LO_X_LO]], 32
-; CHECK-NEXT: [[CROSS_SUM_LO:%.*]] = and i64 [[CROSS_SUM]], 4294967295
-; CHECK-NEXT: [[CROSS_SUM_HI:%.*]] = lshr i64 [[CROSS_SUM]], 32
-; CHECK-NEXT: [[LOW_ACCUM:%.*]] = add nuw nsw i64 [[CROSS_SUM_LO]], [[Y_LO_X_LO_HI]]
-; CHECK-NEXT: [[UPPER_MID:%.*]] = add nuw i64 [[Y_HI_X_HI]], [[CARRY]]
-; CHECK-NEXT: [[LOW_ACCUM_HI:%.*]] = lshr i64 [[LOW_ACCUM]], 32
-; CHECK-NEXT: [[UPPER_MID_WITH_CROSS:%.*]] = add i64 [[UPPER_MID]], [[CROSS_SUM_HI]]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[UPPER_MID_WITH_CROSS]], [[LOW_ACCUM_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: [[HI_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8
; CHECK-NEXT: store i64 [[TMP5]], ptr [[HI_PTR]], align 8
-; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = shl i64 [[LOW_ACCUM]], 32
+; CHECK-NEXT: [[LW64:%.*]] = mul i64 [[X]], [[Y]]
+; CHECK-NEXT: [[LOW_ACCUM_SHIFTED:%.*]] = and i64 [[LW64]], -4294967296
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[LOW_ACCUM_SHIFTED]])
-; CHECK-NEXT: [[Y_LO_X_LO_LO:%.*]] = and i64 [[Y_LO_X_LO]], 4294967295
-; CHECK-NEXT: [[LW64:%.*]] = or disjoint i64 [[LOW_ACCUM_SHIFTED]], [[Y_LO_X_LO_LO]]
; CHECK-NEXT: store i64 [[LW64]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
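
For reference outside the FileCheck noise: the "carry" shape that the full_mul_int128 tests above exercise is, in rough C++ terms (a sketch with illustrative names, not code taken from the patch):

#include <cstdint>

// Carry-variant high half of a 64x64 multiply; this is the scalarized
// form the pass now recognises and folds to one widened multiply.
uint64_t umulh_carry(uint64_t x, uint64_t y) {
  uint64_t xl = x & 0xffffffffu, xh = x >> 32;
  uint64_t yl = y & 0xffffffffu, yh = y >> 32;
  uint64_t cross = yh * xl + yl * xh;                // may wrap once
  uint64_t carry = cross < yl * xh ? 1ull << 32 : 0; // recover the wrap
  uint64_t low_accum = (cross & 0xffffffffu) + ((yl * xl) >> 32);
  return yh * xh + carry + (cross >> 32) + (low_accum >> 32);
}

The select in the IR is the carry line: if the cross-sum wrapped, the lost 2^64 reappears as 2^32 in the high word.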
diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll
index 6e56eb86516c5..745c61923d0f8 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll
@@ -5,22 +5,11 @@
define i64 @umulh_variant(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh_variant(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
-; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
-; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
-; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
-; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
-; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
-; CHECK-NEXT: [[TMP5:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[TMP5]]
;
%x_lo = and i64 %x, 4294967295
@@ -48,22 +37,11 @@ define i64 @umulh_variant(i64 %x, i64 %y) {
define i32 @umulh_variant_i32(i32 %x, i32 %y) {
; CHECK-LABEL: define i32 @umulh_variant_i32(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i32 [[X]], 65535
-; CHECK-NEXT: [[Y_LO:%.*]] = and i32 [[Y]], 65535
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i32 [[X]], 16
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i32 [[Y]], 16
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i32 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i32 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i32 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i32 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[T0_HI:%.*]] = lshr i32 [[T0]], 16
-; CHECK-NEXT: [[U0:%.*]] = add nuw i32 [[T0_HI]], [[T1]]
-; CHECK-NEXT: [[U0_LO:%.*]] = and i32 [[U0]], 65535
-; CHECK-NEXT: [[U0_HI:%.*]] = lshr i32 [[U0]], 16
-; CHECK-NEXT: [[U1:%.*]] = add nuw i32 [[U0_LO]], [[T2]]
-; CHECK-NEXT: [[U1_HI:%.*]] = lshr i32 [[U1]], 16
-; CHECK-NEXT: [[U2:%.*]] = add nuw i32 [[U0_HI]], [[T3]]
-; CHECK-NEXT: [[HW64:%.*]] = add nuw i32 [[U2]], [[U1_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP3]], 32
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i64 [[TMP4]] to i32
; CHECK-NEXT: ret i32 [[HW64]]
;
%x_lo = and i32 %x, u0xffff
@@ -91,22 +69,11 @@ define i32 @umulh_variant_i32(i32 %x, i32 %y) {
define <2 x i32> @umulh_variant_v2i32(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: define <2 x i32> @umulh_variant_v2i32(
; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and <2 x i32> [[X]], splat (i32 65535)
-; CHECK-NEXT: [[Y_LO:%.*]] = and <2 x i32> [[Y]], splat (i32 65535)
-; CHECK-NEXT: [[X_HI:%.*]] = lshr <2 x i32> [[X]], splat (i32 16)
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr <2 x i32> [[Y]], splat (i32 16)
-; CHECK-NEXT: [[T0:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw <2 x i32> [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw <2 x i32> [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[T0_HI:%.*]] = lshr <2 x i32> [[T0]], splat (i32 16)
-; CHECK-NEXT: [[U0:%.*]] = add nuw <2 x i32> [[T0_HI]], [[T1]]
-; CHECK-NEXT: [[U0_LO:%.*]] = and <2 x i32> [[U0]], splat (i32 65535)
-; CHECK-NEXT: [[U0_HI:%.*]] = lshr <2 x i32> [[U0]], splat (i32 16)
-; CHECK-NEXT: [[U1:%.*]] = add nuw <2 x i32> [[U0_LO]], [[T2]]
-; CHECK-NEXT: [[U1_HI:%.*]] = lshr <2 x i32> [[U1]], splat (i32 16)
-; CHECK-NEXT: [[U2:%.*]] = add nuw <2 x i32> [[U0_HI]], [[T3]]
-; CHECK-NEXT: [[HW64:%.*]] = add nuw <2 x i32> [[U2]], [[U1_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[Y]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[X]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 32)
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw <2 x i64> [[TMP4]] to <2 x i32>
; CHECK-NEXT: ret <2 x i32> [[HW64]]
;
%x_lo = and <2 x i32> %x, <i32 u0xffff, i32 u0xffff>
@@ -134,22 +101,11 @@ define <2 x i32> @umulh_variant_v2i32(<2 x i32> %x, <2 x i32> %y) {
define i128 @umulh_variant_i128(i128 %x, i128 %y) {
; CHECK-LABEL: define i128 @umulh_variant_i128(
; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i128 [[X]], 18446744073709551615
-; CHECK-NEXT: [[Y_LO:%.*]] = and i128 [[Y]], 18446744073709551615
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i128 [[X]], 64
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i128 [[Y]], 64
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i128 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i128 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i128 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i128 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[T0_HI:%.*]] = lshr i128 [[T0]], 64
-; CHECK-NEXT: [[U0:%.*]] = add nuw i128 [[T0_HI]], [[T1]]
-; CHECK-NEXT: [[U0_LO:%.*]] = and i128 [[U0]], 18446744073709551615
-; CHECK-NEXT: [[U0_HI:%.*]] = lshr i128 [[U0]], 64
-; CHECK-NEXT: [[U1:%.*]] = add nuw i128 [[U0_LO]], [[T2]]
-; CHECK-NEXT: [[U1_HI:%.*]] = lshr i128 [[U1]], 64
-; CHECK-NEXT: [[U2:%.*]] = add nuw i128 [[U0_HI]], [[T3]]
-; CHECK-NEXT: [[HW64:%.*]] = add nuw i128 [[U2]], [[U1_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i128 [[Y]] to i256
+; CHECK-NEXT: [[TMP2:%.*]] = zext i128 [[X]] to i256
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i256 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i256 [[TMP3]], 128
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i256 [[TMP4]] to i128
; CHECK-NEXT: ret i128 [[HW64]]
;
%x_lo = and i128 %x, u0xffffffffffffffff
@@ -177,22 +133,11 @@ define i128 @umulh_variant_i128(i128 %x, i128 %y) {
define i64 @umulh_variant_commuted(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh_variant_commuted(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
-; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[X_LO]], [[Y_LO]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[X_LO]], [[Y_HI]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[X_HI]], [[Y_LO]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[X_HI]], [[Y_HI]]
-; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T1]], [[T0_HI]]
-; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
-; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[T2]], [[U0_LO]]
-; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
-; CHECK-NEXT: [[U2:%.*]] = add nuw nsw i64 [[U1_HI]], [[U0_HI]]
-; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[T3]], [[U2]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[HW64]]
;
%x_lo = and i64 %x, 4294967295
@@ -403,21 +348,13 @@ define i64 @umulh_variant__mul_use__t0(i64 %x, i64 %y) {
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T0]])
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
-; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
-; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
-; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
-; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
-; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[HW64]]
;
%x_lo = and i64 %x, 4294967295
@@ -447,23 +384,15 @@ define i64 @umulh_variant__mul_use__t0(i64 %x, i64 %y) {
define i64 @umulh_variant__mul_use__t1(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh_variant__mul_use__t1(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
-; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T1]])
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
-; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
-; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
-; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
-; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
-; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[HW64]]
;
%x_lo = and i64 %x, 4294967295
@@ -494,22 +423,14 @@ define i64 @umulh_variant__mul_use__t2(i64 %x, i64 %y) {
; CHECK-LABEL: define i64 @umulh_variant__mul_use__t2(
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
-; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T2]])
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
-; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
-; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
-; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
-; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
-; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
-; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[HW64]]
;
%x_lo = and i64 %x, 4294967295
@@ -587,21 +508,14 @@ define i64 @umulh_variant__mul_use__t0_hi(i64 %x, i64 %y) {
; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) {
; CHECK-NEXT: [[X_LO:%.*]] = and i64 [[X]], 4294967295
; CHECK-NEXT: [[Y_LO:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[X_HI:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[Y_HI:%.*]] = lshr i64 [[Y]], 32
; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[Y_LO]], [[X_LO]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[Y_LO]], [[X_HI]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[Y_HI]], [[X_LO]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[Y_HI]], [[X_HI]]
; CHECK-NEXT: [[T0_HI:%.*]] = lshr i64 [[T0]], 32
; CHECK-NEXT: call void (...) @llvm.fake.use(i64 [[T0_HI]])
-; CHECK-NEXT: [[U0:%.*]] = add nuw i64 [[T0_HI]], [[T1]]
-; CHECK-NEXT: [[U0_LO:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT: [[U0_HI:%.*]] = lshr i64 [[U0]], 32
-; CHECK-NEXT: [[U1:%.*]] = add nuw i64 [[U0_LO]], [[T2]]
-; CHECK-NEXT: [[U1_HI:%.*]] = lshr i64 [[U1]], 32
-; CHECK-NEXT: [[U2:%.*]] = add nuw i64 [[U0_HI]], [[T3]]
-; CHECK-NEXT: [[HW64:%.*]] = add nuw i64 [[U2]], [[U1_HI]]
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 64
+; CHECK-NEXT: [[HW64:%.*]] = trunc nuw i128 [[TMP4]] to i64
; CHECK-NEXT: ret i64 [[HW64]]
;
%x_lo = and i64 %x, 4294967295
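
The ladder variant matched in this file accumulates the partial products through two intermediate sums instead of an explicit carry select. A minimal C++ sketch of the same shape (illustrative, not taken from the tests):

#include <cstdint>

// Ladder-variant high half of a 64x64 multiply: u0/u1 play the role of
// c2/c3 in the pass comments; no compare is needed because each add is
// provably non-wrapping (nuw in the IR).
uint64_t umulh_ladder(uint64_t x, uint64_t y) {
  uint64_t xl = x & 0xffffffffu, xh = x >> 32;
  uint64_t yl = y & 0xffffffffu, yh = y >> 32;
  uint64_t u0 = (yl * xl >> 32) + yl * xh;
  uint64_t u1 = (u0 & 0xffffffffu) + yh * xl;
  return (u0 >> 32) + yh * xh + (u1 >> 32);
}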
diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll
index 5f84bc4e93b82..307fc62a6b4ba 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder4.ll
@@ -6,25 +6,11 @@ define i32 @mul_ladder4(i32 %x, i32 %y) {
; CHECK-LABEL: define i32 @mul_ladder4(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
-; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
-; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
-; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
-; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[XL]], [[YL]]
-; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[XL]], [[YH]]
-; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[XH]], [[YL]]
-; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[XH]], [[YH]]
-; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
-; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535
-; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[SHR8]], [[CONV10]]
-; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535
-; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[ADD]], [[CONV12]]
-; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
-; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16
-; CHECK-NEXT: [[ADD16:%.*]] = add nuw i32 [[MULHH]], [[SHR15]]
-; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16
-; CHECK-NEXT: [[ADD18:%.*]] = add nuw i32 [[ADD16]], [[SHR17]]
-; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[ADD18]], [[SHR14]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw i64 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[ADD19]]
;
entry:
@@ -54,25 +40,11 @@ define <2 x i32> @mul_ladder4_v2i32(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: define <2 x i32> @mul_ladder4_v2i32(
; CHECK-SAME: <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[XL:%.*]] = and <2 x i32> [[X]], splat (i32 65535)
-; CHECK-NEXT: [[XH:%.*]] = lshr <2 x i32> [[X]], splat (i32 16)
-; CHECK-NEXT: [[YL:%.*]] = and <2 x i32> [[Y]], splat (i32 65535)
-; CHECK-NEXT: [[YH:%.*]] = lshr <2 x i32> [[Y]], splat (i32 16)
-; CHECK-NEXT: [[MULLL:%.*]] = mul nuw <2 x i32> [[XL]], [[YL]]
-; CHECK-NEXT: [[MULLH:%.*]] = mul nuw <2 x i32> [[XL]], [[YH]]
-; CHECK-NEXT: [[MULHL:%.*]] = mul nuw <2 x i32> [[XH]], [[YL]]
-; CHECK-NEXT: [[MULHH:%.*]] = mul nuw <2 x i32> [[XH]], [[YH]]
-; CHECK-NEXT: [[SHR8:%.*]] = lshr <2 x i32> [[MULLL]], splat (i32 16)
-; CHECK-NEXT: [[CONV10:%.*]] = and <2 x i32> [[MULLH]], splat (i32 65535)
-; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw <2 x i32> [[SHR8]], [[CONV10]]
-; CHECK-NEXT: [[CONV12:%.*]] = and <2 x i32> [[MULHL]], splat (i32 65535)
-; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw <2 x i32> [[ADD]], [[CONV12]]
-; CHECK-NEXT: [[SHR14:%.*]] = lshr <2 x i32> [[ADD13]], splat (i32 16)
-; CHECK-NEXT: [[SHR15:%.*]] = lshr <2 x i32> [[MULLH]], splat (i32 16)
-; CHECK-NEXT: [[ADD16:%.*]] = add nuw <2 x i32> [[MULHH]], [[SHR15]]
-; CHECK-NEXT: [[SHR17:%.*]] = lshr <2 x i32> [[MULHL]], splat (i32 16)
-; CHECK-NEXT: [[ADD18:%.*]] = add nuw <2 x i32> [[ADD16]], [[SHR17]]
-; CHECK-NEXT: [[ADD19:%.*]] = add nuw <2 x i32> [[ADD18]], [[SHR14]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[X]] to <2 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[Y]] to <2 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw <2 x i64> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 32)
+; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw <2 x i64> [[TMP3]] to <2 x i32>
; CHECK-NEXT: ret <2 x i32> [[ADD19]]
;
entry:
@@ -102,25 +74,11 @@ define i128 @mul_ladder4_i128(i128 %x, i128 %y) {
; CHECK-LABEL: define i128 @mul_ladder4_i128(
; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[XL:%.*]] = and i128 [[X]], 18446744073709551615
-; CHECK-NEXT: [[XH:%.*]] = lshr i128 [[X]], 64
-; CHECK-NEXT: [[YL:%.*]] = and i128 [[Y]], 18446744073709551615
-; CHECK-NEXT: [[YH:%.*]] = lshr i128 [[Y]], 64
-; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i128 [[XL]], [[YL]]
-; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i128 [[XL]], [[YH]]
-; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i128 [[XH]], [[YL]]
-; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i128 [[XH]], [[YH]]
-; CHECK-NEXT: [[SHR8:%.*]] = lshr i128 [[MULLL]], 64
-; CHECK-NEXT: [[CONV10:%.*]] = and i128 [[MULLH]], 18446744073709551615
-; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i128 [[SHR8]], [[CONV10]]
-; CHECK-NEXT: [[CONV12:%.*]] = and i128 [[MULHL]], 18446744073709551615
-; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i128 [[ADD]], [[CONV12]]
-; CHECK-NEXT: [[SHR14:%.*]] = lshr i128 [[ADD13]], 64
-; CHECK-NEXT: [[SHR15:%.*]] = lshr i128 [[MULLH]], 64
-; CHECK-NEXT: [[ADD16:%.*]] = add nuw i128 [[MULHH]], [[SHR15]]
-; CHECK-NEXT: [[SHR17:%.*]] = lshr i128 [[MULHL]], 64
-; CHECK-NEXT: [[ADD18:%.*]] = add nuw i128 [[ADD16]], [[SHR17]]
-; CHECK-NEXT: [[ADD19:%.*]] = add nuw i128 [[ADD18]], [[SHR14]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i128 [[X]] to i256
+; CHECK-NEXT: [[TMP1:%.*]] = zext i128 [[Y]] to i256
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i256 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i256 [[TMP2]], 128
+; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw i256 [[TMP3]] to i128
; CHECK-NEXT: ret i128 [[ADD19]]
;
entry:
@@ -150,25 +108,11 @@ define i32 @mul_ladder4_commutted(i32 %x, i32 %y) {
; CHECK-LABEL: define i32 @mul_ladder4_commutted(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
-; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
-; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
-; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
-; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[YL]], [[XL]]
-; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[YH]], [[XL]]
-; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[YL]], [[XH]]
-; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[YH]], [[XH]]
-; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
-; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULLH]], 65535
-; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV10]], [[SHR8]]
-; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULHL]], 65535
-; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[CONV12]], [[ADD]]
-; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
-; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULLH]], 16
-; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULHL]], 16
-; CHECK-NEXT: [[ADD16:%.*]] = add nuw nsw i32 [[SHR14]], [[SHR17]]
-; CHECK-NEXT: [[ADD18:%.*]] = add nuw nsw i32 [[ADD16]], [[SHR15]]
-; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[MULHH]], [[ADD18]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[Y]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw i64 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[ADD19]]
;
entry:
@@ -198,25 +142,11 @@ define i32 @mul_ladder4_swap_hl_lh(i32 %x, i32 %y) {
; CHECK-LABEL: define i32 @mul_ladder4_swap_hl_lh(
; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[XL:%.*]] = and i32 [[X]], 65535
-; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
-; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y]], 65535
-; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
-; CHECK-NEXT: [[MULLL:%.*]] = mul nuw i32 [[XL]], [[YL]]
-; CHECK-NEXT: [[MULLH:%.*]] = mul nuw i32 [[XL]], [[YH]]
-; CHECK-NEXT: [[MULHL:%.*]] = mul nuw i32 [[XH]], [[YL]]
-; CHECK-NEXT: [[MULHH:%.*]] = mul nuw i32 [[XH]], [[YH]]
-; CHECK-NEXT: [[SHR8:%.*]] = lshr i32 [[MULLL]], 16
-; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[MULHL]], 65535
-; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[SHR8]], [[CONV10]]
-; CHECK-NEXT: [[CONV12:%.*]] = and i32 [[MULLH]], 65535
-; CHECK-NEXT: [[ADD13:%.*]] = add nuw nsw i32 [[ADD]], [[CONV12]]
-; CHECK-NEXT: [[SHR14:%.*]] = lshr i32 [[ADD13]], 16
-; CHECK-NEXT: [[SHR15:%.*]] = lshr i32 [[MULHL]], 16
-; CHECK-NEXT: [[ADD16:%.*]] = add nuw i32 [[MULHH]], [[SHR15]]
-; CHECK-NEXT: [[SHR17:%.*]] = lshr i32 [[MULLH]], 16
-; CHECK-NEXT: [[ADD18:%.*]] = add nuw i32 [[ADD16]], [[SHR17]]
-; CHECK-NEXT: [[ADD19:%.*]] = add nuw i32 [[ADD18]], [[SHR14]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[X]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[Y]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 32
+; CHECK-NEXT: [[ADD19:%.*]] = trunc nuw i64 [[TMP3]] to i32
; CHECK-NEXT: ret i32 [[ADD19]]
;
entry:
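
The ladder4 form splits the accumulation one step further: the three low halves are summed first, and only their carries feed the high word. Sketched in C++ on i32 with 16-bit halves, mirroring the tests above (illustrative names, not from the test files):

#include <cstdint>

// Ladder4 high half of a 32x32 multiply. `low` is at most about
// 3*(2^16-1), so every add here fits comfortably in 32 bits.
uint32_t mul_ladder4(uint32_t x, uint32_t y) {
  uint32_t xl = x & 0xffff, xh = x >> 16;
  uint32_t yl = y & 0xffff, yh = y >> 16;
  uint32_t low = (xl * yl >> 16) + (xl * yh & 0xffff) + (xh * yl & 0xffff);
  return xh * yh + (xl * yh >> 16) + (xh * yl >> 16) + (low >> 16);
}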
>From d4ab4a35f2511a0d725e1ae6362a61ea6fe19cd2 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 19 Nov 2025 08:59:53 +0000
Subject: [PATCH 3/4] Address comments
---
.../AggressiveInstCombine.cpp | 31 ++++++++++---------
1 file changed, 16 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index fb71f57eaa502..12ee2fe4efd83 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1513,6 +1513,7 @@ static bool foldMulHigh(Instruction &I) {
Value *Mul = Builder.CreateMul(XExt, YExt);
Value *High = Builder.CreateLShr(Mul, BW);
Value *Res = Builder.CreateTrunc(High, Ty);
+ Res->takeName(&I);
I.replaceAllUsesWith(Res);
LLVM_DEBUG(dbgs() << "Created long multiply from parts of " << *X << " and "
<< *Y << "\n");
(With Res->takeName(&I), the replacement keeps the name of the instruction it replaces, so the FileCheck updates above retain names like HW64 and ADD19 on the final trunc.)
@@ -1529,20 +1530,20 @@ static bool foldMulHigh(Instruction &I) {
m_And(m_Specific(Y), m_SpecificInt(LowMask))));
};
- auto foldMulHighCarry = [&](Value *X, Value *Y, Instruction *Carry,
+ auto FoldMulHighCarry = [&](Value *X, Value *Y, Instruction *Carry,
Instruction *B) {
// Looking for LowSum >> 32 and carry (select)
if (Carry->getOpcode() != Instruction::Select)
std::swap(Carry, B);
// Carry = LowSum < XhYl ? 0x100000000 : 0
- CmpPredicate Pred;
Value *LowSum, *XhYl;
if (!match(Carry,
m_OneUse(m_Select(
- m_OneUse(m_ICmp(Pred, m_Value(LowSum), m_Value(XhYl))),
- m_SpecificInt(APInt(BW, 1) << BW / 2), m_SpecificInt(0)))) ||
- Pred != ICmpInst::ICMP_ULT)
+ m_OneUse(m_SpecificICmp(ICmpInst::ICMP_ULT, m_Value(LowSum),
+ m_Value(XhYl))),
+ m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)),
+ m_SpecificInt(0)))))
return false;
// XhYl can be Xh*Yl or Xl*Yh
@@ -1583,7 +1584,7 @@ static bool foldMulHigh(Instruction &I) {
return CreateMulHigh(X, Y);
};
- auto foldMulHighLadder = [&](Value *X, Value *Y, Instruction *A,
+ auto FoldMulHighLadder = [&](Value *X, Value *Y, Instruction *A,
Instruction *B) {
// xh*yh + c2>>32 + c3>>32
// c2 = xh*yl + (xl*yl >> 32); c3 = c2&0xffffffff + xl*yh
@@ -1622,7 +1623,7 @@ static bool foldMulHigh(Instruction &I) {
return CreateMulHigh(X, Y);
};
- auto foldMulHighLadder4 = [&](Value *X, Value *Y, Instruction *A,
+ auto FoldMulHighLadder4 = [&](Value *X, Value *Y, Instruction *A,
Instruction *B, Instruction *C) {
/// Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32;
/// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
@@ -1679,7 +1680,7 @@ static bool foldMulHigh(Instruction &I) {
return CreateMulHigh(X, Y);
};
- auto foldMulHighCarry4 = [&](Value *X, Value *Y, Instruction *Carry,
+ auto FoldMulHighCarry4 = [&](Value *X, Value *Y, Instruction *Carry,
Instruction *B, Instruction *C) {
// xh*yh + carry + crosssum>>32 + (xl*yl + crosssum&0xffffffff) >> 32
// crosssum = xh*yl+xl*yh
@@ -1690,13 +1691,13 @@ static bool foldMulHigh(Instruction &I) {
std::swap(Carry, C);
// Carry = CrossSum < XhYl ? 0x100000000 : 0
- CmpPredicate Pred;
Value *CrossSum, *XhYl;
if (!match(Carry,
m_OneUse(m_Select(
- m_OneUse(m_ICmp(Pred, m_Value(CrossSum), m_Value(XhYl))),
- m_SpecificInt(APInt(BW, 1) << BW / 2), m_SpecificInt(0)))) ||
- Pred != ICmpInst::ICMP_ULT)
+ m_OneUse(m_SpecificICmp(ICmpInst::ICMP_ULT,
+ m_Value(CrossSum), m_Value(XhYl))),
+ m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)),
+ m_SpecificInt(0)))))
return false;
if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BW / 2))))
@@ -1741,7 +1742,7 @@ static bool foldMulHigh(Instruction &I) {
match(&I, m_c_Add(m_Instruction(A),
m_OneUse(m_c_Add(HiHi, m_Instruction(B)))))) &&
A->hasOneUse() && B->hasOneUse())
- if (foldMulHighCarry(X, Y, A, B) || foldMulHighLadder(X, Y, A, B))
+ if (FoldMulHighCarry(X, Y, A, B) || FoldMulHighLadder(X, Y, A, B))
return true;
if ((match(&I, m_c_Add(HiHi, m_OneUse(m_c_Add(
@@ -1760,8 +1761,8 @@ static bool foldMulHigh(Instruction &I) {
m_c_Add(m_OneUse(m_c_Add(HiHi, m_Instruction(A))),
m_OneUse(m_Add(m_Instruction(B), m_Instruction(C)))))) &&
A->hasOneUse() && B->hasOneUse() && C->hasOneUse())
- return foldMulHighCarry4(X, Y, A, B, C) ||
- foldMulHighLadder4(X, Y, A, B, C);
+ return FoldMulHighCarry4(X, Y, A, B, C) ||
+ FoldMulHighLadder4(X, Y, A, B, C);
return false;
}
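
One of the review changes above replaces the shifted-one constant with APInt::getOneBitSet. The two spellings are equivalent; a quick standalone check, assuming an LLVM tree to compile against:

#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  unsigned BW = 64;
  // Old spelling: shift a one up to the half-width bit position.
  llvm::APInt Old = llvm::APInt(BW, 1) << (BW / 2);
  // New spelling: construct the single-bit value directly.
  llvm::APInt New = llvm::APInt::getOneBitSet(BW, BW / 2);
  assert(Old == New); // both are 0x100000000 for BW == 64
}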
>From f06ebc3cc0c59b1f621afd975aa110613a5e6982 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 20 Nov 2025 08:44:58 +0000
Subject: [PATCH 4/4] Address Comments 2
---
.../AggressiveInstCombine.cpp | 68 +++++++++++--------
.../AggressiveInstCombine/umulh_ladder.ll | 40 +++++++++++
2 files changed, 80 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 12ee2fe4efd83..7e11b863a2869 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1487,6 +1487,7 @@ static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
/// lowsum = xh*yl + xl*yh + (xl*yl>>32)
/// Ladder: xh*yh + c2>>32 + c3>>32
/// c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
+/// or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xl*yh
/// Carry4: xh*yh + carry + crosssum>>32 + (xl*yl + crosssum&0xffffffff) >> 32
/// crosssum = xh*yl + xl*yh
/// carry = crosssum < xh*yl ? 0x100000000 : 0
@@ -1510,9 +1511,9 @@ static bool foldMulHigh(Instruction &I) {
Type *NTy = Ty->getWithNewBitWidth(BW * 2);
Value *XExt = Builder.CreateZExt(X, NTy);
Value *YExt = Builder.CreateZExt(Y, NTy);
- Value *Mul = Builder.CreateMul(XExt, YExt);
+ Value *Mul = Builder.CreateMul(XExt, YExt, "", true);
Value *High = Builder.CreateLShr(Mul, BW);
- Value *Res = Builder.CreateTrunc(High, Ty);
+ Value *Res = Builder.CreateTrunc(High, Ty, "", true);
Res->takeName(&I);
I.replaceAllUsesWith(Res);
LLVM_DEBUG(dbgs() << "Created long multiply from parts of " << *X << " and "
@@ -1542,8 +1543,7 @@ static bool foldMulHigh(Instruction &I) {
m_OneUse(m_Select(
m_OneUse(m_SpecificICmp(ICmpInst::ICMP_ULT, m_Value(LowSum),
m_Value(XhYl))),
- m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)),
- m_SpecificInt(0)))))
+ m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)), m_Zero()))))
return false;
// XhYl can be Xh*Yl or Xl*Yh
@@ -1556,7 +1556,7 @@ static bool foldMulHigh(Instruction &I) {
if (XhYl->hasNUsesOrMore(3))
return false;
- // B = LowSum >> 16
+ // B = LowSum >> 32
if (!match(B,
m_OneUse(m_LShr(m_Specific(LowSum), m_SpecificInt(BW / 2)))) ||
LowSum->hasNUsesOrMore(3))
@@ -1587,28 +1587,43 @@ static bool foldMulHigh(Instruction &I) {
auto FoldMulHighLadder = [&](Value *X, Value *Y, Instruction *A,
Instruction *B) {
// xh*yh + c2>>32 + c3>>32
- // c2 = xh*yl + (xl*yl >> 32); c3 = c2&0xffffffff + xl*yh
- Value *XlYh, *XhYl, *C2, *C3;
+ // c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
+ // or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xl*yh
+ Value *XlYh, *XhYl, *XlYl, *C2, *C3;
// Strip off the two expected shifts.
if (!match(A, m_LShr(m_Value(C2), m_SpecificInt(BW / 2))) ||
!match(B, m_LShr(m_Value(C3), m_SpecificInt(BW / 2))))
return false;
- // Match c3 = c2&0xffffffff + xl*yh
- if (!match(C3, m_c_Add(m_And(m_Specific(C2), m_SpecificInt(LowMask)),
- m_Value(XhYl))))
+ if (match(C3, m_c_Add(m_Add(m_Value(), m_Value()), m_Value())))
std::swap(C2, C3);
- if (!match(C3,
- m_c_Add(m_OneUse(m_And(m_Specific(C2), m_SpecificInt(LowMask))),
- m_Value(XhYl))) ||
- !C3->hasOneUse() || C2->hasNUsesOrMore(3))
- return false;
+ // Try to match c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32)
+ if (match(C2, m_c_Add(m_c_Add(m_And(m_Specific(C3), m_SpecificInt(LowMask)),
+ m_Value(XlYh)),
+ m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)))) ||
+ match(C2, m_c_Add(m_c_Add(m_And(m_Specific(C3), m_SpecificInt(LowMask)),
+ m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2))),
+ m_Value(XlYh))) ||
+ match(C2, m_c_Add(m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)),
+ m_Value(XlYh)),
+ m_And(m_Specific(C3), m_SpecificInt(LowMask))))) {
+ XhYl = C3;
+ } else {
+ // Match c3 = c2&0xffffffff + xl*yh
+ if (!match(C3, m_c_Add(m_And(m_Specific(C2), m_SpecificInt(LowMask)),
+ m_Value(XlYh))))
+ std::swap(C2, C3);
+ if (!match(C3, m_c_Add(m_OneUse(
+ m_And(m_Specific(C2), m_SpecificInt(LowMask))),
+ m_Value(XlYh))) ||
+ !C3->hasOneUse() || C2->hasNUsesOrMore(3))
+ return false;
- // Match c2 = xh*yl + (xl*yl >> 32)
- Value *XlYl;
- if (!match(C2, m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)),
- m_Value(XlYh))))
- return false;
+ // Match c2 = xh*yl + (xl*yl >> 32)
+ if (!match(C2, m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BW / 2)),
+ m_Value(XhYl))))
+ return false;
+ }
// Match XhYl and XlYh - they can appear either way around.
if (!CheckHiLo(XlYh, Y, X))
@@ -1696,8 +1711,7 @@ static bool foldMulHigh(Instruction &I) {
m_OneUse(m_Select(
m_OneUse(m_SpecificICmp(ICmpInst::ICMP_ULT,
m_Value(CrossSum), m_Value(XhYl))),
- m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)),
- m_SpecificInt(0)))))
+ m_SpecificInt(APInt::getOneBitSet(BW, BW / 2)), m_Zero()))))
return false;
if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BW / 2))))
@@ -1720,12 +1734,10 @@ static bool foldMulHigh(Instruction &I) {
std::swap(X, Y);
if (!CheckHiLo(XhYl, X, Y))
return false;
- if (!match(CrossSum,
- m_c_Add(m_Specific(XhYl),
- m_OneUse(m_c_Mul(
- m_LShr(m_Specific(Y), m_SpecificInt(BW / 2)),
- m_And(m_Specific(X), m_SpecificInt(LowMask)))))) ||
- CrossSum->hasNUsesOrMore(4) || XhYl->hasNUsesOrMore(3))
+ Value *XlYh;
+ if (!match(CrossSum, m_c_Add(m_Specific(XhYl), m_OneUse(m_Value(XlYh)))) ||
+ !CheckHiLo(XlYh, Y, X) || CrossSum->hasNUsesOrMore(4) ||
+ XhYl->hasNUsesOrMore(3))
return false;
return CreateMulHigh(X, Y);
diff --git a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll
index 745c61923d0f8..257cc0315c72f 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/umulh_ladder.ll
@@ -816,3 +816,43 @@ define i64 @umulh_variant__mul_use__u2(i64 %x, i64 %y) {
%hw64 = add nuw i64 %u2, %u1_hi
ret i64 %hw64
}
+
+define [2 x i64] @XXH_mult64to128(i64 noundef %lhs, i64 noundef %rhs) {
+; CHECK-LABEL: define [2 x i64] @XXH_mult64to128(
+; CHECK-SAME: i64 noundef [[LHS:%.*]], i64 noundef [[RHS:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[RHS]] to i128
+; CHECK-NEXT: [[TMP1:%.*]] = zext i64 [[LHS]] to i128
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i128 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i128 [[TMP2]], 64
+; CHECK-NEXT: [[ADD16:%.*]] = trunc nuw i128 [[TMP3]] to i64
+; CHECK-NEXT: [[SHR102:%.*]] = mul i64 [[LHS]], [[RHS]]
+; CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x i64] poison, i64 [[SHR102]], 0
+; CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i64] [[DOTFCA_0_INSERT]], i64 [[ADD16]], 1
+; CHECK-NEXT: ret [2 x i64] [[DOTFCA_1_INSERT]]
+;
+entry:
+ %and = and i64 %lhs, 4294967295
+ %and1 = and i64 %rhs, 4294967295
+ %mul.i = mul nuw i64 %and1, %and
+ %shr = lshr i64 %lhs, 32
+ %mul.i27 = mul nuw i64 %and1, %shr
+ %shr5 = lshr i64 %rhs, 32
+ %mul.i28 = mul nuw i64 %shr5, %and
+ %mul.i29 = mul nuw i64 %shr5, %shr
+ %shr10 = lshr i64 %mul.i, 32
+ %and11 = and i64 %mul.i27, 4294967295
+ %add = add nuw i64 %and11, %mul.i28
+ %add12 = add nuw i64 %add, %shr10
+ %shr13 = lshr i64 %mul.i27, 32
+ %shr14 = lshr i64 %add12, 32
+ %add15 = add nuw i64 %shr13, %mul.i29
+ %add16 = add nuw i64 %add15, %shr14
+ %shl = shl i64 %add12, 32
+ %and17 = and i64 %mul.i, 4294967295
+ %or = or disjoint i64 %shl, %and17
+ %.fca.0.insert = insertvalue [2 x i64] poison, i64 %or, 0
+ %.fca.1.insert = insertvalue [2 x i64] %.fca.0.insert, i64 %add16, 1
+ ret [2 x i64] %.fca.1.insert
+}
+
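
This last test is the 64-to-128-bit multiply shape of xxHash's XXH_mult64to128. After the fold plus instcombine, both result words come from a single widened multiply, i.e. semantically (a sketch assuming a compiler with unsigned __int128 support):

#include <cstdint>

struct U128 { uint64_t low, high; };

// Post-fold semantics of the test above: one 128-bit multiply, split
// into its low and high 64-bit words.
U128 mult64to128(uint64_t lhs, uint64_t rhs) {
  unsigned __int128 p = (unsigned __int128)lhs * rhs;
  return { (uint64_t)p, (uint64_t)(p >> 64) };
}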