[llvm] df45c18 - [DAG] PromoteIntRes_ADDSUBSHLSAT - promote ISD::UADDSAT as clamped add

Tue Feb 16 09:38:28 PST 2021

Author: Simon Pilgrim
Date: 2021-02-16T17:37:44Z
New Revision: df45c1813509fc579c5e0a4f2824923db92c2dbb

URL: https://github.com/llvm/llvm-project/commit/df45c1813509fc579c5e0a4f2824923db92c2dbb
DIFF: https://github.com/llvm/llvm-project/commit/df45c1813509fc579c5e0a4f2824923db92c2dbb.diff

LOG: [DAG] PromoteIntRes_ADDSUBSHLSAT - promote ISD::UADDSAT as clamped add

Similar to D96622, we're better off just promoting uaddsat(x,y) -> umin(add(x,y),c) instead of trying to perform a shifted uaddsat.

I initially tried to just use shifted promotion in cases where we didn't have a legal/custom umin - but we don't appear to have any targets that have uaddsat but not umin, so imo we're better off always using the umin and avoid an untested shifted uaddsat code path.

Differential Revision: https://reviews.llvm.org/D96767

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
    llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
    llvm/test/CodeGen/AMDGPU/uaddsat.ll
    llvm/test/CodeGen/X86/uadd_sat_vec.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index deeddb45e047..52f326c10bc0 100644

--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -784,6 +784,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
   EVT PromotedType = Op1Promoted.getValueType();
   unsigned NewBits = PromotedType.getScalarSizeInBits();
 
+  if (Opcode == ISD::UADDSAT) {
+    APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits);
+    SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
+    SDValue Add =
+        DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
+    return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
+  }
+
   // USUBSAT can always be promoted as long as we have zero-extended the args.
   if (Opcode == ISD::USUBSAT)
     return DAG.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted,
@@ -799,7 +807,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
     case ISD::SSHLSAT:
       ShiftOp = ISD::SRA;
       break;
-    case ISD::UADDSAT:
     case ISD::USHLSAT:
       ShiftOp = ISD::SRL;
       break;
@@ -822,14 +829,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
     return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
   }
 
-  if (Opcode == ISD::UADDSAT) {
-    APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits);
-    SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
-    SDValue Add =
-        DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
-    return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
-  }
-
   unsigned AddOp = Opcode == ISD::SADDSAT ? ISD::ADD : ISD::SUB;
   APInt MinVal = APInt::getSignedMinValue(OldBits).sext(NewBits);
   APInt MaxVal = APInt::getSignedMaxValue(OldBits).sext(NewBits);

diff  --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index 43a32b43b585..21427a6a92d7 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -116,22 +116,21 @@ define void @v4i8(<4 x i8>* %px, <4 x i8>* %py, <4 x i8>* %pz) nounwind {
 ; CHECK-NEXT:    ldrb w9, [x1]
 ; CHECK-NEXT:    ldrb w10, [x0, #1]
 ; CHECK-NEXT:    ldrb w11, [x1, #1]
+; CHECK-NEXT:    ldrb w12, [x0, #2]
 ; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    ldrb w8, [x1, #2]
 ; CHECK-NEXT:    fmov s1, w9
-; CHECK-NEXT:    ldrb w8, [x0, #2]
-; CHECK-NEXT:    ldrb w9, [x1, #2]
 ; CHECK-NEXT:    mov v0.h[1], w10
+; CHECK-NEXT:    ldrb w9, [x0, #3]
+; CHECK-NEXT:    ldrb w10, [x1, #3]
 ; CHECK-NEXT:    mov v1.h[1], w11
-; CHECK-NEXT:    ldrb w10, [x0, #3]
-; CHECK-NEXT:    ldrb w11, [x1, #3]
-; CHECK-NEXT:    mov v0.h[2], w8
-; CHECK-NEXT:    mov v1.h[2], w9
-; CHECK-NEXT:    mov v0.h[3], w10
-; CHECK-NEXT:    mov v1.h[3], w11
-; CHECK-NEXT:    shl v1.4h, v1.4h, #8
-; CHECK-NEXT:    shl v0.4h, v0.4h, #8
-; CHECK-NEXT:    uqadd v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    ushr v0.4h, v0.4h, #8
+; CHECK-NEXT:    mov v0.h[2], w12
+; CHECK-NEXT:    mov v1.h[2], w8
+; CHECK-NEXT:    mov v0.h[3], w9
+; CHECK-NEXT:    mov v1.h[3], w10
+; CHECK-NEXT:    movi d2, #0xff00ff00ff00ff
+; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    umin v0.4h, v0.4h, v2.4h
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    str s0, [x2]
 ; CHECK-NEXT:    ret
@@ -150,13 +149,12 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
 ; CHECK-NEXT:    ldrb w10, [x0, #1]
 ; CHECK-NEXT:    ldrb w11, [x1, #1]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fmov s2, w9
 ; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    shl v1.2s, v1.2s, #24
-; CHECK-NEXT:    shl v0.2s, v0.2s, #24
-; CHECK-NEXT:    uqadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    ushr v0.2s, v0.2s, #24
+; CHECK-NEXT:    mov v2.s[1], w11
+; CHECK-NEXT:    movi d1, #0x0000ff000000ff
+; CHECK-NEXT:    add v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    umin v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strb w9, [x2]
@@ -192,13 +190,12 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
 ; CHECK-NEXT:    ldrh w10, [x0, #2]
 ; CHECK-NEXT:    ldrh w11, [x1, #2]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    fmov s2, w9
 ; CHECK-NEXT:    mov v0.s[1], w10
-; CHECK-NEXT:    mov v1.s[1], w11
-; CHECK-NEXT:    shl v1.2s, v1.2s, #16
-; CHECK-NEXT:    shl v0.2s, v0.2s, #16
-; CHECK-NEXT:    uqadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    ushr v0.2s, v0.2s, #16
+; CHECK-NEXT:    mov v2.s[1], w11
+; CHECK-NEXT:    movi d1, #0x00ffff0000ffff
+; CHECK-NEXT:    add v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    umin v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT:    mov w8, v0.s[1]
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    strh w9, [x2]
@@ -271,12 +268,10 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; CHECK-LABEL: v16i4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v2.16b, #15
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    shl v1.16b, v1.16b, #4
-; CHECK-NEXT:    shl v0.16b, v0.16b, #4
-; CHECK-NEXT:    uqadd v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ushr v0.16b, v0.16b, #4
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    umin v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT:    ret
   %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
   ret <16 x i4> %z

diff  --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index 5447d5ea0b75..5082772fae98 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -17,19 +17,15 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
 ; GFX8-LABEL: v_uaddsat_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
+; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT:    v_min_u16_e32 v0, 0xff, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_uaddsat_i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT:    v_add_u16_e64 v0, v0, v1 clamp
-; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT:    v_min_u16_e32 v0, 0xff, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
   ret i8 %result

diff  --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
index 5624dfd80844..d6d090352b4b 100644
--- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll
@@ -481,26 +481,20 @@ define void @v1i16(<1 x i16>* %px, <1 x i16>* %py, <1 x i16>* %pz) nounwind {
 define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; SSE-LABEL: v16i4:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    psllw $4, %xmm1
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    psllw $4, %xmm0
 ; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    paddusb %xmm1, %xmm0
-; SSE-NEXT:    psrlw $4, %xmm0
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    paddb %xmm1, %xmm0
+; SSE-NEXT:    pminub %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: v16i4:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpsllw $4, %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
   ret <16 x i4> %z