[llvm] 9708d88 - Revert rG42230efccf8fe1185be5fa6c23dce0a8183d6ec9 "[DAG] Fold (sra (or (shl x, c1), (shl y, c2)), c1) -> (sext_inreg (or x, (shl y,c2-c1))) iff c2 >= c1"

Simon Pilgrim via llvm-commits <llvm-commits at lists.llvm.org>
Wed Oct 19 04:07:55 PDT 2022


Author: Simon Pilgrim
Date: 2022-10-19T12:07:41+01:00
New Revision: 9708d88017d0c9adaea65a4f5a5b589b67f292e2

URL: https://github.com/llvm/llvm-project/commit/9708d88017d0c9adaea65a4f5a5b589b67f292e2
DIFF: https://github.com/llvm/llvm-project/commit/9708d88017d0c9adaea65a4f5a5b589b67f292e2.diff

LOG: Revert rG42230efccf8fe1185be5fa6c23dce0a8183d6ec9 "[DAG] Fold (sra (or (shl x, c1), (shl y, c2)), c1) -> (sext_inreg (or x, (shl y,c2-c1))) iff c2 >= c1"

@foad was right - this isn't actually going to help with D136042 as much as hoped; we need a better AMDGPU-specific solution, as other targets are unlikely to make use of it
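
For context, a minimal LLVM IR sketch of the pattern the reverted combine matched, using the shift amounts from the s_sbfe_or_shl_shl_nonuniform_i32 test updated below (c1 = 17, c2 = 19; the test's loads and stores are elided, and the function and value names are illustrative):

  define i32 @sra_or_shl_shl(i32 %x, i32 %y) {
    %shl1 = shl i32 %x, 17      ; shl x, c1
    %shl2 = shl i32 %y, 19      ; shl y, c2
    %or   = or i32 %shl1, %shl2
    %sra  = ashr i32 %or, 17    ; sra ..., c1
    ret i32 %sra
  }

Since c2 >= c1, the low c1 bits of both shl results are zero, so the ashr recovers x | (y << (c2 - c1)) = x | (y << 2) in the low 32 - c1 = 15 bits and sign-extends from there, i.e. sext_inreg(or(x, shl(y, 2)), i15), which AMDGPU can select as a single s_bfe_i32 (see the removed CHECK lines below).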

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AMDGPU/bfe-patterns.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6b2662180b81f..33136452d4f2e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9433,49 +9433,12 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
         TargetLowering::Legal)
       return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
                          N0.getOperand(0), DAG.getValueType(ExtVT));
-
     // Even if we can't convert to sext_inreg, we might be able to remove
     // this shift pair if the input is already sign extended.
     if (DAG.ComputeNumSignBits(N0.getOperand(0)) > N1C->getZExtValue())
       return N0.getOperand(0);
   }
 
-  // fold (sra (or (shl x, c1), (shl y, c2)), c1)
-  //  -> (sext_inreg (or x, (shl y,c2-c1)))
-  // for some c1 and target supports sext_inreg.
-  if (N1C && N0.getOpcode() == ISD::OR &&
-      N0.getOperand(0).getOpcode() == ISD::SHL &&
-      N0.getOperand(1).getOpcode() == ISD::SHL &&
-      (N1 == N0.getOperand(0).getOperand(1) ||
-       N1 == N0.getOperand(1).getOperand(1)) &&
-      N0->hasOneUse() && N0.getOperand(0)->hasOneUse() &&
-      N0.getOperand(1)->hasOneUse()) {
-    ConstantSDNode *N00C = isConstOrConstSplat(N0.getOperand(0).getOperand(1));
-    ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1).getOperand(1));
-    if (N00C && N01C &&
-        N00C->getAPIntValue().uge(N1C->getZExtValue()) &&
-        N01C->getAPIntValue().uge(N1C->getZExtValue())) {
-      unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
-      EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
-      if (VT.isVector())
-        ExtVT = EVT::getVectorVT(*DAG.getContext(), ExtVT,
-                                 VT.getVectorElementCount());
-      if (!LegalOperations ||
-          TLI.getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) ==
-              TargetLowering::Legal) {
-        // Apply SRL on top of the SHL nodes, SimplifyDemandedBits will clean
-        // this up. It looks messy but it's a lot simpler than handling all the
-        // possible shift value type mismatches we could have....
-        SDLoc DL(N);
-        SDValue LHS = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
-        SDValue RHS = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(1), N1);
-        SDValue Or = DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
-        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Or,
-                           DAG.getValueType(ExtVT));
-      }
-    }
-  }
-
   // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
   // clamp (add c1, c2) to max shift.
   if (N0.getOpcode() == ISD::SRA) {

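As the deleted comment above notes, the removed combine built the sext_inreg via an intermediate form: it applied SRL on top of each SHL and let SimplifyDemandedBits cancel the shift pairs. A rough LLVM IR rendering of that intermediate state for the test below (c1 = 17, c2 = 19; sext_inreg is written as trunc/sext, and names are illustrative):

  define i32 @intermediate(i32 %x, i32 %y) {
    %shl1 = shl i32 %x, 17
    %shl2 = shl i32 %y, 19
    %lhs  = lshr i32 %shl1, 17    ; srl (shl x, c1), c1
    %rhs  = lshr i32 %shl2, 17    ; srl (shl y, c2), c1
    %or   = or i32 %lhs, %rhs
    %lo   = trunc i32 %or to i15  ; sext_inreg modeled as trunc...
    %sext = sext i15 %lo to i32   ; ...followed by sext
    ret i32 %sext
  }

Under the sext_inreg only the low 15 bits of %or are demanded, so SimplifyDemandedBits folds %lhs to %x and %rhs to shl %y, 2, leaving sext_inreg(or(x, shl(y, 2)), i15). That is the form the old CHECK lines below selected as s_bfe_i32 ..., 0xf0000 (width 15, offset 0).
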
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index fe242b6635451..1c72c08a34c7b 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -440,38 +440,40 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(i32 addrspace(1)* %out,
   ret void
 }
 
-; ashr(or(shl(x,c1),shl(y,c2)),c1) -> sign_extend_inreg(or(x,shl(y,c2-c1))) iff c2 >= c1
+; TODO ashr(or(shl(x,c1),shl(y,c2)),c1) -> sign_extend_inreg(or(x,shl(y,c2-c1))) iff c2 >= c1
 define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x, i32 addrspace(1)* %y) {
 ; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_load_dword s4, s[4:5], 0x0
-; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_load_dword s2, s[6:7], 0x0
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s4, s4, 2
-; SI-NEXT:    s_or_b32 s2, s2, s4
-; SI-NEXT:    s_bfe_i32 s4, s2, 0xf0000
-; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_lshl_b32 s1, s2, 17
+; SI-NEXT:    s_lshl_b32 s0, s0, 19
+; SI-NEXT:    s_or_b32 s0, s1, s0
+; SI-NEXT:    s_ashr_i32 s0, s0, 17
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
-; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_load_dword s2, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s0, s4, 2
-; VI-NEXT:    s_or_b32 s0, s2, s0
-; VI-NEXT:    s_bfe_i32 s0, s0, 0xf0000
+; VI-NEXT:    s_lshl_b32 s1, s2, 17
+; VI-NEXT:    s_lshl_b32 s0, s0, 19
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    s_ashr_i32 s0, s0, 17
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm


        

