[llvm] [SimplifyCFG] Treat umul + extract pattern as cheap single instruction (Approach 2) (PR #128021)

Thu Feb 20 07:50:03 PST 2025

https://github.com/spaits created https://github.com/llvm/llvm-project/pull/128021

Fixes issue #115683.

An overflow arithmetic intrinsic followed by an `extractvalue` of its overflow bit is usually generated when a division is being replaced, but the original zero check may still be present. In that case, hoist these two instructions out of their basic block and let later optimizations remove the now-unnecessary zero check.

An example:
```llvm
define i1 @src(i8 %x, i8 %y) zeroext {
entry:
  %cmp.not = icmp eq i8 %y, 0
  br i1 %cmp.not, label %land.end, label %land.rhs

land.rhs:
  %mul = umul_overflow i8 %y, %x
  %mul.ov = extractvalue {i8, i1} %mul, 1
  br label %land.end

land.end:
  %#0 = phi i1 [ 0, %entry ], [ %mul.ov, %land.rhs ]
  ret i1 %#0
}
=>
define i1 @tgt(i8 %x, i8 %y) zeroext {
entry:
  %fx = freeze i8 %x
  %mul = umul_overflow i8 %y, %fx
  %mul.ov = extractvalue {i8, i1} %mul, 1
  ret i1 %mul.ov
}
```

From 9b11238b239526b97a551250edbdabdd9ba94442 Mon Sep 17 00:00:00 2001
From: Gabor Spaits <gaborspaits1 at gmail.com>
Date: Thu, 20 Feb 2025 16:48:13 +0100
Subject: [PATCH] [SimplifyCFG] Treat umul + extract pattern as cheap single
 instruction

---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 24 +++++---
 .../X86/unsigned-multiply-overflow-check.ll   | 34 +++---------
 .../SimplifyCFG/umul-extract-pattern.ll       | 55 +++++++++++++++++++
 3 files changed, 78 insertions(+), 35 deletions(-)
 create mode 100644 llvm/test/Transforms/SimplifyCFG/umul-extract-pattern.ll

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 12dd49da279b9..157716dc7ead9 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -421,11 +421,11 @@ static InstructionCost computeSpeculationCost(const User *I,
 /// After this function returns, Cost is increased by the cost of
 /// V plus its non-dominating operands.  If that cost is greater than
 /// Budget, false is returned and Cost is undefined.
-static bool dominatesMergePoint(Value *V, BasicBlock *BB, Instruction *InsertPt,
-                                SmallPtrSetImpl<Instruction *> &AggressiveInsts,
-                                InstructionCost &Cost, InstructionCost Budget,
-                                const TargetTransformInfo &TTI,
-                                AssumptionCache *AC, unsigned Depth = 0) {
+static bool dominatesMergePoint(
+    Value *V, BasicBlock *BB, Instruction *InsertPt,
+    SmallPtrSetImpl<Instruction *> &AggressiveInsts, InstructionCost &Cost,
+    InstructionCost Budget, const TargetTransformInfo &TTI, AssumptionCache *AC,
+    SmallPtrSetImpl<Instruction *> &ZeroCostInstructions, unsigned Depth = 0) {
   // It is possible to hit a zero-cost cycle (phi/gep instructions for example),
   // so limit the recursion depth.
   // TODO: While this recursion limit does prevent pathological behavior, it
@@ -463,7 +463,12 @@ static bool dominatesMergePoint(Value *V, BasicBlock *BB, Instruction *InsertPt,
   if (!isSafeToSpeculativelyExecute(I, InsertPt, AC))
     return false;
 
-  Cost += computeSpeculationCost(I, TTI);
+  WithOverflowInst *OverflowInst;
+  if (match(I, m_ExtractValue<1>(m_OneUse(m_WithOverflowInst(OverflowInst))))) {
+    ZeroCostInstructions.insert(OverflowInst);
+    Cost += 1;
+  } else if (!ZeroCostInstructions.contains(I))
+    Cost += computeSpeculationCost(I, TTI);
 
   // Allow exactly one instruction to be speculated regardless of its cost
   // (as long as it is safe to do so).
@@ -480,7 +485,7 @@ static bool dominatesMergePoint(Value *V, BasicBlock *BB, Instruction *InsertPt,
   // not take us over the cost threshold.
   for (Use &Op : I->operands())
     if (!dominatesMergePoint(Op, BB, InsertPt, AggressiveInsts, Cost, Budget,
-                             TTI, AC, Depth + 1))
+                             TTI, AC, ZeroCostInstructions, Depth + 1))
       return false;
   // Okay, it's safe to do this!  Remember this instruction.
   AggressiveInsts.insert(I);
@@ -3810,6 +3815,7 @@ static bool foldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
   // instructions.  While we are at it, keep track of the instructions
   // that need to be moved to the dominating block.
   SmallPtrSet<Instruction *, 4> AggressiveInsts;
+  SmallPtrSet<Instruction *, 2> ZeroCostInstructions;
   InstructionCost Cost = 0;
   InstructionCost Budget =
       TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
@@ -3827,9 +3833,9 @@ static bool foldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
     }
 
     if (!dominatesMergePoint(PN->getIncomingValue(0), BB, DomBI,
-                             AggressiveInsts, Cost, Budget, TTI, AC) ||
+                             AggressiveInsts, Cost, Budget, TTI, AC, ZeroCostInstructions) ||
         !dominatesMergePoint(PN->getIncomingValue(1), BB, DomBI,
-                             AggressiveInsts, Cost, Budget, TTI, AC))
+                             AggressiveInsts, Cost, Budget, TTI, AC, ZeroCostInstructions))
       return Changed;
   }
 
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/unsigned-multiply-overflow-check.ll b/llvm/test/Transforms/PhaseOrdering/X86/unsigned-multiply-overflow-check.ll
index 7bcb6ce17df0e..9858591dfc700 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/unsigned-multiply-overflow-check.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/unsigned-multiply-overflow-check.ll
@@ -45,26 +45,17 @@ define i1 @will_not_overflow(i64 %arg, i64 %arg1) {
 ; INSTCOMBINESIMPLIFYCFGONLY-LABEL: @will_not_overflow(
 ; INSTCOMBINESIMPLIFYCFGONLY-NEXT:  bb:
 ; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    [[T0:%.*]] = icmp eq i64 [[ARG:%.*]], 0
-; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    br i1 [[T0]], label [[BB5:%.*]], label [[BB2:%.*]]
-; INSTCOMBINESIMPLIFYCFGONLY:       bb2:
 ; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[ARG]], i64 [[ARG1:%.*]])
 ; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    [[MUL_OV:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
-; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    br label [[BB5]]
-; INSTCOMBINESIMPLIFYCFGONLY:       bb5:
-; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    [[T6:%.*]] = phi i1 [ false, [[BB:%.*]] ], [ [[MUL_OV]], [[BB2]] ]
+; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    [[T6:%.*]] = select i1 [[T0]], i1 false, i1 [[MUL_OV]]
 ; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    ret i1 [[T6]]
 ;
 ; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-LABEL: @will_not_overflow(
 ; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:  bb:
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    [[T0:%.*]] = icmp eq i64 [[ARG:%.*]], 0
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    br i1 [[T0]], label [[BB5:%.*]], label [[BB2:%.*]]
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE:       bb2:
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[ARG]], i64 [[ARG1:%.*]])
+; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    [[ARG1:%.*]] = freeze i64 [[ARG2:%.*]]
+; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[ARG:%.*]], i64 [[ARG1]])
 ; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    [[MUL_OV:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    br label [[BB5]]
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE:       bb5:
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    [[T6:%.*]] = phi i1 [ false, [[BB:%.*]] ], [ [[MUL_OV]], [[BB2]] ]
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    ret i1 [[T6]]
+; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    ret i1 [[MUL_OV]]
 ;
 bb:
   %t0 = icmp eq i64 %arg, 0
@@ -112,28 +103,19 @@ define i1 @will_overflow(i64 %arg, i64 %arg1) {
 ; INSTCOMBINESIMPLIFYCFGONLY-LABEL: @will_overflow(
 ; INSTCOMBINESIMPLIFYCFGONLY-NEXT:  bb:
 ; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    [[T0:%.*]] = icmp eq i64 [[ARG:%.*]], 0
-; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    br i1 [[T0]], label [[BB5:%.*]], label [[BB2:%.*]]
-; INSTCOMBINESIMPLIFYCFGONLY:       bb2:
 ; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[ARG]], i64 [[ARG1:%.*]])
 ; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    [[MUL_OV:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
 ; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    [[PHI_BO:%.*]] = xor i1 [[MUL_OV]], true
-; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    br label [[BB5]]
-; INSTCOMBINESIMPLIFYCFGONLY:       bb5:
-; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    [[T6:%.*]] = phi i1 [ true, [[BB:%.*]] ], [ [[PHI_BO]], [[BB2]] ]
+; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    [[T6:%.*]] = select i1 [[T0]], i1 true, i1 [[PHI_BO]]
 ; INSTCOMBINESIMPLIFYCFGONLY-NEXT:    ret i1 [[T6]]
 ;
 ; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-LABEL: @will_overflow(
 ; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:  bb:
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    [[T0:%.*]] = icmp eq i64 [[ARG:%.*]], 0
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    br i1 [[T0]], label [[BB5:%.*]], label [[BB2:%.*]]
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE:       bb2:
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[ARG]], i64 [[ARG1:%.*]])
+; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    [[ARG1:%.*]] = freeze i64 [[ARG2:%.*]]
+; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[ARG:%.*]], i64 [[ARG1]])
 ; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    [[MUL_OV:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
 ; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    [[PHI_BO:%.*]] = xor i1 [[MUL_OV]], true
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    br label [[BB5]]
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE:       bb5:
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    [[T6:%.*]] = phi i1 [ true, [[BB:%.*]] ], [ [[PHI_BO]], [[BB2]] ]
-; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    ret i1 [[T6]]
+; INSTCOMBINESIMPLIFYCFGINSTCOMBINE-NEXT:    ret i1 [[PHI_BO]]
 ;
 bb:
   %t0 = icmp eq i64 %arg, 0
diff --git a/llvm/test/Transforms/SimplifyCFG/umul-extract-pattern.ll b/llvm/test/Transforms/SimplifyCFG/umul-extract-pattern.ll
new file mode 100644
index 0000000000000..8a5e034998a25
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/umul-extract-pattern.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S < %s | FileCheck %s
+target triple = "riscv64-unknown-unknown-elf"
+
+define i16 @func2(i64 %x, i64 %y) {
+; CHECK-LABEL: @func2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[Y:%.*]], 0
+; CHECK-NEXT:    [[MUL:%.*]] = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[Y]], i64 [[X:%.*]])
+; CHECK-NEXT:    [[MUL_OV:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP_NOT]], i1 false, i1 [[MUL_OV]]
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[SPEC_SELECT]] to i16
+; CHECK-NEXT:    ret i16 [[CONV]]
+;
+entry:
+  %cmp.not = icmp eq i64 %y, 0
+  br i1 %cmp.not, label %land.end, label %land.rhs
+
+land.rhs:                                         ; preds = %entry
+  %mul = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %y, i64 %x)
+  %mul.ov = extractvalue { i64, i1 } %mul, 1
+  br label %land.end
+
+land.end:                                         ; preds = %land.rhs, %entry
+  %0 = phi i1 [ false, %entry ], [ %mul.ov, %land.rhs ]
+  %conv = zext i1 %0 to i16
+  ret i16 %conv
+}
+
+define i16 @noHoist(i64 %x, i64 %y) {
+; CHECK-LABEL: @noHoist(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i64 [[Y:%.*]], 0
+; CHECK-NEXT:    [[ADD2:%.*]] = add nsw i64 [[Y]], [[X:%.*]]
+; CHECK-NEXT:    [[MUL:%.*]] = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[ADD2]], i64 [[X]])
+; CHECK-NEXT:    [[MUL_OV:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[CMP_NOT]], i1 false, i1 [[MUL_OV]]
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[SPEC_SELECT]] to i16
+; CHECK-NEXT:    ret i16 [[CONV]]
+;
+entry:
+  %cmp.not = icmp eq i64 %y, 0
+  br i1 %cmp.not, label %land.end, label %land.rhs
+
+land.rhs:                                   ; preds = %entry
+  %add = add nsw i64 %y, %x
+  %mul = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %add, i64 %x)
+  %mul.ov = extractvalue { i64, i1 } %mul, 1
+  br label %land.end
+
+land.end:                                         ; preds = %land.rhs, %entry
+  %0 = phi i1 [ false, %entry ], [ %mul.ov, %land.rhs ]
+  %conv = zext i1 %0 to i16
+  ret i16 %conv
+}