[llvm] [InstCombine] Detect different vscales in div by shift combine. (PR #126411)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 9 01:56:18 PST 2025
https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/126411
This attempts to fix a regression in code that performs `svcntb() / svcntw()`
(which is just a constant). https://godbolt.org/z/4o3a67s6n. We would previously
expand the svcnt into two different vscale intrinsics, CSE them in a later pass
and then fold udiv of shifts into a constant in a second instcombine.
After #121386 we now introduce a cttz. This patch just adds an additional check
for vscale to the div of shift fold, allowing it to happen earlier and avoiding
the need to look through the awkward (but probably not impossible) cttz that
was introduced.
>From 4b0a7e6c8d95bcf4cba94f7df4587c0a5e82ad01 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Sat, 8 Feb 2025 23:52:23 +0000
Subject: [PATCH 1/2] [InstCombine] Add an extra test for udiv_shl_pair_const
with vscale
---
.../sve-intrinsic-opts-counting-elems.ll | 17 ++++++++++++++++
llvm/test/Transforms/InstCombine/div-shift.ll | 20 +++++++++++++++++++
2 files changed, 37 insertions(+)
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll
index 4e7e9eeb7250bcd..1ecceda9973a23c 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll
@@ -240,6 +240,23 @@ define i64 @cntd_all() {
}
+define i64 @udiv() vscale_range(1, 16) {
+; CHECK-LABEL: @udiv(
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[B:%.*]] = shl nuw nsw i64 [[TMP2]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[B1:%.*]] = shl nuw nsw i64 [[TMP3]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = call range(i64 2, 65) i64 @llvm.cttz.i64(i64 [[B1]], i1 true)
+; CHECK-NEXT: [[C:%.*]] = lshr i64 [[B]], [[TMP4]]
+; CHECK-NEXT: ret i64 [[C]]
+;
+ %a = call i64 @llvm.aarch64.sve.cntb(i32 31)
+ %b = call i64 @llvm.aarch64.sve.cntw(i32 31)
+ %c = udiv i64 %a, %b
+ ret i64 %c
+}
+
+
declare i64 @llvm.aarch64.sve.cntb(i32 %pattern)
declare i64 @llvm.aarch64.sve.cnth(i32 %pattern)
declare i64 @llvm.aarch64.sve.cntw(i32 %pattern)
diff --git a/llvm/test/Transforms/InstCombine/div-shift.ll b/llvm/test/Transforms/InstCombine/div-shift.ll
index af83f37011ba014..5ff09195ec526d5 100644
--- a/llvm/test/Transforms/InstCombine/div-shift.ll
+++ b/llvm/test/Transforms/InstCombine/div-shift.ll
@@ -1399,3 +1399,23 @@ start:
%div = udiv i8 %x, %y
ret i8 %div
}
+
+define i32 @udiv_shl_pair_const_vscale() vscale_range(1, 16) {
+; CHECK-LABEL: @udiv_shl_pair_const_vscale(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[LHS:%.*]] = shl nuw nsw i32 [[A]], 2
+; CHECK-NEXT: [[RHS:%.*]] = shl nuw nsw i32 [[B]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = call range(i32 1, 33) i32 @llvm.cttz.i32(i32 [[RHS]], i1 true)
+; CHECK-NEXT: [[DIV:%.*]] = lshr i32 [[LHS]], [[TMP0]]
+; CHECK-NEXT: ret i32 [[DIV]]
+;
+entry:
+ %a = call i32 @llvm.vscale()
+ %b = call i32 @llvm.vscale()
+ %lhs = shl nuw i32 %a, 2
+ %rhs = shl nuw i32 %b, 1
+ %div = udiv i32 %lhs, %rhs
+ ret i32 %div
+}
>From 0e8caf4f1e96b188e9a9dff002ab5504687ad6ae Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Sun, 9 Feb 2025 09:47:18 +0000
Subject: [PATCH 2/2] [InstCombine] Detect different vscales in div by shift
combine.
This attempts to fix a regression in code that performs `svcntb() / svcntw()`
(which is just a constant). https://godbolt.org/z/4o3a67s6n. We would previously
expand the svcnt into two different vscale intrinsics, CSE them in a later pass
and then fold udiv of shifts into a constant in a second instcombine.
After #121386 we now introduce a cttz. This patch just adds an additional check
for vscale to the div of shift fold, allowing it to happen earlier and avoiding
the need to look through the awkward (but probably not impossible) cttz that
was introduced.
---
llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 8 +++++---
.../AArch64/sve-intrinsic-opts-counting-elems.ll | 8 +-------
llvm/test/Transforms/InstCombine/div-shift.ll | 8 +-------
3 files changed, 7 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index c8bdf029dd71c37..b2382eb7aed3196 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -1205,14 +1205,16 @@ static Value *foldIDivShl(BinaryOperator &I, InstCombiner::BuilderTy &Builder) {
// If X << Y and X << Z does not overflow, then:
// (X << Y) / (X << Z) -> (1 << Y) / (1 << Z) -> 1 << Y >> Z
- if (match(Op0, m_Shl(m_Value(X), m_Value(Y))) &&
- match(Op1, m_Shl(m_Specific(X), m_Value(Z)))) {
+ if ((match(Op0, m_Shl(m_Value(X), m_Value(Y))) &&
+ match(Op1, m_Shl(m_Specific(X), m_Value(Z)))) ||
+ (match(Op0, m_Shl(m_VScale(), m_Value(Y))) &&
+ match(Op1, m_Shl(m_VScale(), m_Value(Z))))) {
auto *Shl0 = cast<OverflowingBinaryOperator>(Op0);
auto *Shl1 = cast<OverflowingBinaryOperator>(Op1);
if (IsSigned ? (Shl0->hasNoSignedWrap() && Shl1->hasNoSignedWrap())
: (Shl0->hasNoUnsignedWrap() && Shl1->hasNoUnsignedWrap())) {
- Constant *One = ConstantInt::get(X->getType(), 1);
+ Constant *One = ConstantInt::get(Op0->getType(), 1);
// Only preserve the nsw flag if dividend has nsw
// or divisor has nsw and operator is sdiv.
Value *Dividend = Builder.CreateShl(
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll
index 1ecceda9973a23c..a398997a3453d2f 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-counting-elems.ll
@@ -242,13 +242,7 @@ define i64 @cntd_all() {
define i64 @udiv() vscale_range(1, 16) {
; CHECK-LABEL: @udiv(
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[B:%.*]] = shl nuw nsw i64 [[TMP2]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[B1:%.*]] = shl nuw nsw i64 [[TMP3]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = call range(i64 2, 65) i64 @llvm.cttz.i64(i64 [[B1]], i1 true)
-; CHECK-NEXT: [[C:%.*]] = lshr i64 [[B]], [[TMP4]]
-; CHECK-NEXT: ret i64 [[C]]
+; CHECK-NEXT: ret i64 4
;
%a = call i64 @llvm.aarch64.sve.cntb(i32 31)
%b = call i64 @llvm.aarch64.sve.cntw(i32 31)
diff --git a/llvm/test/Transforms/InstCombine/div-shift.ll b/llvm/test/Transforms/InstCombine/div-shift.ll
index 5ff09195ec526d5..f42aaa1c60750e4 100644
--- a/llvm/test/Transforms/InstCombine/div-shift.ll
+++ b/llvm/test/Transforms/InstCombine/div-shift.ll
@@ -1403,13 +1403,7 @@ start:
define i32 @udiv_shl_pair_const_vscale() vscale_range(1, 16) {
; CHECK-LABEL: @udiv_shl_pair_const_vscale(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[B:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[LHS:%.*]] = shl nuw nsw i32 [[A]], 2
-; CHECK-NEXT: [[RHS:%.*]] = shl nuw nsw i32 [[B]], 1
-; CHECK-NEXT: [[TMP0:%.*]] = call range(i32 1, 33) i32 @llvm.cttz.i32(i32 [[RHS]], i1 true)
-; CHECK-NEXT: [[DIV:%.*]] = lshr i32 [[LHS]], [[TMP0]]
-; CHECK-NEXT: ret i32 [[DIV]]
+; CHECK-NEXT: ret i32 2
;
entry:
%a = call i32 @llvm.vscale()
More information about the llvm-commits
mailing list