[llvm] a983ef2 - [DAGCombiner][AArch64][VE] Teach BuildUDIV/SDIV to use 2x mul when mulh/mul_lohi are not available.

Fri May 12 09:06:25 PDT 2023

Author: Craig Topper
Date: 2023-05-12T09:06:17-07:00
New Revision: a983ef2c1743d8c8240b83ab307d7adcbaa73693

URL: https://github.com/llvm/llvm-project/commit/a983ef2c1743d8c8240b83ab307d7adcbaa73693
DIFF: https://github.com/llvm/llvm-project/commit/a983ef2c1743d8c8240b83ab307d7adcbaa73693.diff

LOG: [DAGCombiner][AArch64][VE] Teach BuildUDIV/SDIV to use 2x mul when mulh/mul_lohi are not available.

Correct the legality of i32 mul_lohi on AArch64.

Previously, AArch64 incorrectly reported i32 mul_lohi as Legal.
This allowed BuildUDIV/SDIV to use them. A later DAGCombiner would
replace them with MULHS/MULHU because only the high half was used.
This conversion does not check the legality of MULHS/MULHU under
the assumption that LegalizeDAG can turn it back into MUL_LOHI later.

After they are converted to MULHS/MULHU, DAGCombine ran and saw that
these operations aren't supported but an i64 MUL is. So they get
converted to that plus a shift. Without this, LegalizeDAG would
convert back MUL_LOHI and isel would fail to find a pattern.

This patch teaches BuildUDIV/SDIV to create the wide mul and shift
so that we can report the correct operation legality on AArch64. It
also enables div by constant folding for more cases on VE.

I don't know if VE wants this div by constant optimization or not. If they
don't want it, they can use the isIntDivCheap hook to disable it.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D150333

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/VE/Scalar/div.ll
    llvm/test/CodeGen/VE/Scalar/rem.ll
    llvm/test/CodeGen/VE/Vector/vec_divrem.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 7c78699e43a4d..74e67bc744f21 100644

--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5990,6 +5990,19 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
           DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y);
       return SDValue(LoHi.getNode(), 1);
     }
+    // If type twice as wide legal, widen and use a mul plus a shift.
+    if (!VT.isVector()) {
+      unsigned Size = VT.getSizeInBits();
+      EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), Size * 2);
+      if (isOperationLegal(ISD::MUL, WideVT)) {
+        X = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, X);
+        Y = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, Y);
+        Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y);
+        Y = DAG.getNode(ISD::SRL, dl, WideVT, Y,
+                        DAG.getShiftAmountConstant(EltBits, WideVT, dl));
+        return DAG.getNode(ISD::TRUNCATE, dl, VT, Y);
+      }
+    }
     return SDValue();
   };
 
@@ -6163,6 +6176,19 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
           DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y);
       return SDValue(LoHi.getNode(), 1);
     }
+    // If type twice as wide legal, widen and use a mul plus a shift.
+    if (!VT.isVector()) {
+      unsigned Size = VT.getSizeInBits();
+      EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), Size * 2);
+      if (isOperationLegal(ISD::MUL, WideVT)) {
+        X = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, X);
+        Y = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Y);
+        Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y);
+        Y = DAG.getNode(ISD::SRL, dl, WideVT, Y,
+                        DAG.getShiftAmountConstant(EltBits, WideVT, dl));
+        return DAG.getNode(ISD::TRUNCATE, dl, VT, Y);
+      }
+    }
     return SDValue(); // No mulhu or equivalent
   };
 

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1aacab3ff6855..093d7984e7cd2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -580,6 +580,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::MULHS, MVT::i32, Expand);
 
   // AArch64 doesn't have {U|S}MUL_LOHI.
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
 

diff  --git a/llvm/test/CodeGen/VE/Scalar/div.ll b/llvm/test/CodeGen/VE/Scalar/div.ll
index f9ed91a013700..64caf8a835468 100644
--- a/llvm/test/CodeGen/VE/Scalar/div.ll
+++ b/llvm/test/CodeGen/VE/Scalar/div.ll
@@ -149,7 +149,11 @@ define i64 @divi64ri(i64 %a, i64 %b) {
 define signext i32 @divi32ri(i32 signext %a, i32 signext %b) {
 ; CHECK-LABEL: divi32ri:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    divs.w.sx %s0, %s0, (62)0
+; CHECK-NEXT:    lea %s1, 1431655766
+; CHECK-NEXT:    muls.l %s0, %s0, %s1
+; CHECK-NEXT:    srl %s1, %s0, 63
+; CHECK-NEXT:    srl %s0, %s0, 32
+; CHECK-NEXT:    adds.w.sx %s0, %s0, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
 ; CHECK-NEXT:    b.l.t (, %s10)
   %r = sdiv i32 %a, 3
@@ -185,8 +189,10 @@ define i64 @divu64ri(i64 %a, i64 %b) {
 define zeroext i32 @divu32ri(i32 zeroext %a, i32 zeroext %b) {
 ; CHECK-LABEL: divu32ri:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    divu.w %s0, %s0, (62)0
-; CHECK-NEXT:    adds.w.zx %s0, %s0, (0)1
+; CHECK-NEXT:    lea %s1, -1431655765
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    muls.l %s0, %s0, %s1
+; CHECK-NEXT:    srl %s0, %s0, 33
 ; CHECK-NEXT:    b.l.t (, %s10)
   %r = udiv i32 %a, 3
   ret i32 %r

diff  --git a/llvm/test/CodeGen/VE/Scalar/rem.ll b/llvm/test/CodeGen/VE/Scalar/rem.ll
index 793c1d7c32b2d..9911405c6a68d 100644
--- a/llvm/test/CodeGen/VE/Scalar/rem.ll
+++ b/llvm/test/CodeGen/VE/Scalar/rem.ll
@@ -165,7 +165,11 @@ define i64 @remi64ri(i64 %a) {
 define signext i32 @remi32ri(i32 signext %a) {
 ; CHECK-LABEL: remi32ri:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    divs.w.sx %s1, %s0, (62)0
+; CHECK-NEXT:    lea %s1, 1431655766
+; CHECK-NEXT:    muls.l %s1, %s0, %s1
+; CHECK-NEXT:    srl %s2, %s1, 63
+; CHECK-NEXT:    srl %s1, %s1, 32
+; CHECK-NEXT:    adds.w.sx %s1, %s1, %s2
 ; CHECK-NEXT:    muls.w.sx %s1, 3, %s1
 ; CHECK-NEXT:    subs.w.sx %s0, %s0, %s1
 ; CHECK-NEXT:    adds.w.sx %s0, %s0, (0)1
@@ -205,7 +209,10 @@ define i64 @remu64ri(i64 %a) {
 define zeroext i32 @remu32ri(i32 zeroext %a) {
 ; CHECK-LABEL: remu32ri:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    divu.w %s1, %s0, (62)0
+; CHECK-NEXT:    lea %s1, -1431655765
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    muls.l %s1, %s0, %s1
+; CHECK-NEXT:    srl %s1, %s1, 33
 ; CHECK-NEXT:    muls.w.sx %s1, 3, %s1
 ; CHECK-NEXT:    subs.w.sx %s0, %s0, %s1
 ; CHECK-NEXT:    adds.w.zx %s0, %s0, (0)1

diff  --git a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
index e967225fd31ed..3bc0aba8d4264 100644
--- a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
+++ b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
@@ -8,13 +8,18 @@ define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) {
 ; CHECK-LABEL: udiv_by_minus_one:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    and %s0, %s0, (56)0
+; CHECK-NEXT:    lea %s4, 16843010
+; CHECK-NEXT:    muls.l %s0, %s0, %s4
+; CHECK-NEXT:    srl %s0, %s0, 32
 ; CHECK-NEXT:    and %s1, %s1, (56)0
+; CHECK-NEXT:    muls.l %s1, %s1, %s4
+; CHECK-NEXT:    srl %s1, %s1, 32
 ; CHECK-NEXT:    and %s2, %s2, (56)0
+; CHECK-NEXT:    muls.l %s2, %s2, %s4
+; CHECK-NEXT:    srl %s2, %s2, 32
 ; CHECK-NEXT:    and %s3, %s3, (56)0
-; CHECK-NEXT:    divu.w %s3, %s3, (56)0
-; CHECK-NEXT:    divu.w %s2, %s2, (56)0
-; CHECK-NEXT:    divu.w %s1, %s1, (56)0
-; CHECK-NEXT:    divu.w %s0, %s0, (56)0
+; CHECK-NEXT:    muls.l %s3, %s3, %s4
+; CHECK-NEXT:    srl %s3, %s3, 32
 ; CHECK-NEXT:    b.l.t (, %s10)
   %r = udiv <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255>
   ret <4 x i8> %r
@@ -27,16 +32,21 @@ define <4 x i8> @urem_by_minus_one(<4 x i8> %x) {
 ; CHECK-NEXT:    and %s1, %s1, (56)0
 ; CHECK-NEXT:    and %s2, %s2, (56)0
 ; CHECK-NEXT:    and %s3, %s3, (56)0
-; CHECK-NEXT:    divu.w %s4, %s3, (56)0
-; CHECK-NEXT:    muls.w.sx %s4, %s4, (56)0
-; CHECK-NEXT:    subs.w.sx %s3, %s3, %s4
-; CHECK-NEXT:    divu.w %s4, %s2, (56)0
-; CHECK-NEXT:    muls.w.sx %s4, %s4, (56)0
-; CHECK-NEXT:    subs.w.sx %s2, %s2, %s4
-; CHECK-NEXT:    divu.w %s4, %s1, (56)0
-; CHECK-NEXT:    muls.w.sx %s4, %s4, (56)0
-; CHECK-NEXT:    subs.w.sx %s1, %s1, %s4
-; CHECK-NEXT:    divu.w %s4, %s0, (56)0
+; CHECK-NEXT:    lea %s4, 16843010
+; CHECK-NEXT:    muls.l %s5, %s3, %s4
+; CHECK-NEXT:    srl %s5, %s5, 32
+; CHECK-NEXT:    muls.w.sx %s5, %s5, (56)0
+; CHECK-NEXT:    subs.w.sx %s3, %s3, %s5
+; CHECK-NEXT:    muls.l %s5, %s2, %s4
+; CHECK-NEXT:    srl %s5, %s5, 32
+; CHECK-NEXT:    muls.w.sx %s5, %s5, (56)0
+; CHECK-NEXT:    subs.w.sx %s2, %s2, %s5
+; CHECK-NEXT:    muls.l %s5, %s1, %s4
+; CHECK-NEXT:    srl %s5, %s5, 32
+; CHECK-NEXT:    muls.w.sx %s5, %s5, (56)0
+; CHECK-NEXT:    subs.w.sx %s1, %s1, %s5
+; CHECK-NEXT:    muls.l %s4, %s0, %s4
+; CHECK-NEXT:    srl %s4, %s4, 32
 ; CHECK-NEXT:    muls.w.sx %s4, %s4, (56)0
 ; CHECK-NEXT:    subs.w.sx %s0, %s0, %s4
 ; CHECK-NEXT:    b.l.t (, %s10)