[llvm] [InstCombine] Add one-use limitation to box multiply fold (PR #72876)

Mon Dec 4 02:34:16 PST 2023

https://github.com/shaojingzhi updated https://github.com/llvm/llvm-project/pull/72876

>From 91c77fc4e6784a1e35f2be5e6e4d5bf7ea950827 Mon Sep 17 00:00:00 2001
From: shaojingzhi <28193696+shaojingzhi at users.noreply.github.com>
Date: Mon, 20 Nov 2023 22:09:43 +0800
Subject: [PATCH 1/5] Update InstCombineAddSub.cpp

Bail out of the box multiply fold when an operand of the add has more than one user, because the fold then cannot reduce the instruction count.
---
 llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 90b1c133461a4..5b82c3179792f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1405,6 +1405,14 @@ static Instruction *foldBoxMultiply(BinaryOperator &I) {
   // ResLo = (CrossSum << HalfBits) + (YLo * XLo)
   Value *XLo, *YLo;
   Value *CrossSum;
+  
+  // Checking the operands of I is used in no more than one place,
+  // which can not be deleted, cause a mul instruction has far more weight than
+  // add and shl instruction in IR, thus this method cannot achieve the goal of
+  // simplifying instructions, just return null.
+  if ((!I.getOperand(0)->hasOneUser() || !I.getOperand(1)->hasOneUser()))
+    return nullptr;
+
   if (!match(&I, m_c_Add(m_Shl(m_Value(CrossSum), m_SpecificInt(HalfBits)),
                          m_Mul(m_Value(YLo), m_Value(XLo)))))
     return nullptr;

>From c6c87dbab6cba862cd0da4c6bf72b6e24fa0f613 Mon Sep 17 00:00:00 2001
From: shaojingzhi <28193696+shaojingzhi at users.noreply.github.com>
Date: Mon, 20 Nov 2023 22:12:34 +0800
Subject: [PATCH 2/5] Update mul_full_64.ll

---
 llvm/test/Transforms/InstCombine/mul_full_64.ll | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/InstCombine/mul_full_64.ll b/llvm/test/Transforms/InstCombine/mul_full_64.ll
index 8a57b548cd14b..5c57270fb147d 100644
--- a/llvm/test/Transforms/InstCombine/mul_full_64.ll
+++ b/llvm/test/Transforms/InstCombine/mul_full_64.ll
@@ -177,6 +177,7 @@ define i64 @mul_full_64_variant2(i64 %a, i64 %b, ptr nocapture %rhi) {
   ret i64 %add27
 }
 
+; Negative test case for mul_fold function: MUL7 is used in more than one place
 define i64 @mul_full_64_variant3(i64 %a, i64 %b, ptr nocapture %rhi) {
 ; CHECK-LABEL: @mul_full_64_variant3(
 ; CHECK-NEXT:    [[CONV:%.*]] = and i64 [[A:%.*]], 4294967295
@@ -196,7 +197,9 @@ define i64 @mul_full_64_variant3(i64 %a, i64 %b, ptr nocapture %rhi) {
 ; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i64 [[ADD15]], 32
 ; CHECK-NEXT:    [[ADD17:%.*]] = add i64 [[ADD10]], [[SHR_I]]
 ; CHECK-NEXT:    store i64 [[ADD17]], ptr [[RHI:%.*]], align 8
-; CHECK-NEXT:    [[ADD19:%.*]] = mul i64 [[A]], [[B]]
+; CHECK-NEXT:    [[ADD18:%.*]] = add i64 [[MUL6]], [[MUL5]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[ADD18]], 32
+; CHECK-NEXT:    [[ADD19:%.*]] = add i64 [[SHL]], [[MUL7]]
 ; CHECK-NEXT:    ret i64 [[ADD19]]
 ;
   %conv = and i64 %a, 4294967295

>From 76097e23f035d2c3e12fa20ea5edd00f1a43f86f Mon Sep 17 00:00:00 2001
From: shaojingzhi <28193696+shaojingzhi at users.noreply.github.com>
Date: Wed, 22 Nov 2023 15:45:00 +0800
Subject: [PATCH 3/5] Update
 llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp

Co-authored-by: Yingwei Zheng <dtcxzyw at qq.com>
---
 llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 5b82c3179792f..e7e9dfdb7acbb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1410,11 +1410,8 @@ static Instruction *foldBoxMultiply(BinaryOperator &I) {
   // which can not be deleted, cause a mul instruction has far more weight than
   // add and shl instruction in IR, thus this method cannot achieve the goal of
   // simplifying instructions, just return null.
-  if ((!I.getOperand(0)->hasOneUser() || !I.getOperand(1)->hasOneUser()))
-    return nullptr;
-
   if (!match(&I, m_c_Add(m_Shl(m_Value(CrossSum), m_SpecificInt(HalfBits)),
-                         m_Mul(m_Value(YLo), m_Value(XLo)))))
+                         m_OneUse(m_Mul(m_Value(YLo), m_Value(XLo))))))
     return nullptr;
 
   // XLo = X & HalfMask

>From 512db8aa2cb853f703eb493fa2fd2462c9ef9aef Mon Sep 17 00:00:00 2001
From: shaojingzhi <28193696+shaojingzhi at users.noreply.github.com>
Date: Sat, 25 Nov 2023 17:06:24 +0800
Subject: [PATCH 4/5] Update
 llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp

Reword the code comment.

Co-authored-by: Nikita Popov <github at npopov.com>
---
 llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index e7e9dfdb7acbb..7c3401e9b7a67 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1406,10 +1406,8 @@ static Instruction *foldBoxMultiply(BinaryOperator &I) {
   Value *XLo, *YLo;
   Value *CrossSum;
   
-  // Checking the operands of I is used in no more than one place,
-  // which can not be deleted, cause a mul instruction has far more weight than
-  // add and shl instruction in IR, thus this method cannot achieve the goal of
-  // simplifying instructions, just return null.
+  // Require one-use on the multiply to avoid increasing the number of
+  // multiplications.
   if (!match(&I, m_c_Add(m_Shl(m_Value(CrossSum), m_SpecificInt(HalfBits)),
                          m_OneUse(m_Mul(m_Value(YLo), m_Value(XLo))))))
     return nullptr;

>From 50d5689a93e43cba1d5d0af4423f508fa6105164 Mon Sep 17 00:00:00 2001
From: shaojingzhi <shaojingzhi98 at gmail.com>
Date: Mon, 4 Dec 2023 17:45:56 +0800
Subject: [PATCH 5/5] Add test case

Add a test case showing that the shl does not need a one-use constraint.
---
 .../InstCombine/InstCombineAddSub.cpp         |  1 -
 llvm/test/Transforms/InstCombine/mul_fold.ll  | 27 +++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 7c3401e9b7a67..e1d65b9bfa061 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1405,7 +1405,6 @@ static Instruction *foldBoxMultiply(BinaryOperator &I) {
   // ResLo = (CrossSum << HalfBits) + (YLo * XLo)
   Value *XLo, *YLo;
   Value *CrossSum;
-  
   // Require one-use on the multiply to avoid increasing the number of
   // multiplications.
   if (!match(&I, m_c_Add(m_Shl(m_Value(CrossSum), m_SpecificInt(HalfBits)),
diff --git a/llvm/test/Transforms/InstCombine/mul_fold.ll b/llvm/test/Transforms/InstCombine/mul_fold.ll
index d20ac6070d108..a1fdec3c68cc4 100644
--- a/llvm/test/Transforms/InstCombine/mul_fold.ll
+++ b/llvm/test/Transforms/InstCombine/mul_fold.ll
@@ -712,3 +712,30 @@ define i8 @mul8_low_miss_half_width(i8 %in0, i8 %in1) {
   %retLo = add i8 %shl, %m00
   ret i8 %retLo
 }
+
+; Test case to show shl doesn't need hasOneUse constraint
+define i32 @mul32_low_extra_shl_use(i32 %in0, i32 %in1) {
+; CHECK-LABEL: @mul32_low_extra_shl_use(
+; CHECK-NEXT:    [[IN0HI:%.*]] = lshr i32 [[IN0:%.*]], 16
+; CHECK-NEXT:    [[IN1HI:%.*]] = lshr i32 [[IN1:%.*]], 16
+; CHECK-NEXT:    [[M10:%.*]] = mul i32 [[IN1HI]], [[IN0]]
+; CHECK-NEXT:    [[M01:%.*]] = mul i32 [[IN0HI]], [[IN1]]
+; CHECK-NEXT:    [[ADDC:%.*]] = add i32 [[M10]], [[M01]]
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[ADDC]], 16
+; CHECK-NEXT:    call void @use32(i32 [[SHL]])
+; CHECK-NEXT:    [[RETLO:%.*]] = mul i32 [[IN0]], [[IN1]]
+; CHECK-NEXT:    ret i32 [[RETLO]]
+;
+  %In0Lo = and i32 %in0, 65535
+  %In0Hi = lshr i32 %in0, 16
+  %In1Lo = and i32 %in1, 65535
+  %In1Hi = lshr i32 %in1, 16
+  %m10 = mul i32 %In1Hi, %In0Lo
+  %m01 = mul i32 %In1Lo, %In0Hi
+  %m00 = mul i32 %In1Lo, %In0Lo
+  %addc = add i32 %m10, %m01
+  %shl = shl i32 %addc, 16
+  call void @use32(i32 %shl)
+  %retLo = add i32 %shl, %m00
+  ret i32 %retLo
+}
\ No newline at end of file