[llvm] [DAGCombiner][ARM] Teach reduceLoadWidth to handle (and (srl (load), C), ShiftedMask) (PR #80342)

via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 1 13:23:01 PST 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-arm

Author: Craig Topper (topperc)

<details>
<summary>Changes</summary>

If we have a shifted mask, we may be able to reduce the load width
to the width of the non-zero part of the mask and apply an offset
to the base address to remove the srl. The bit offset is given by
C + the number of trailing zeros in ShiftedMask.
    
Then we add a final shl to restore the trailing zero bits.
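
As a sketch of the mask decomposition (a minimal standalone C++ analogue; `isShiftedMask` here is a hypothetical helper mirroring what the patch gets from `APInt::isShiftedMask`, not LLVM's actual API):

```cpp
#include <bit>      // std::countr_zero, std::popcount (C++20)
#include <cassert>
#include <cstdint>
#include <cstdio>

// Decompose Mask into its trailing-zero offset and the width of its
// contiguous run of one bits; returns false if the ones are not contiguous.
static bool isShiftedMask(uint32_t Mask, unsigned &Offset, unsigned &ActiveBits) {
  if (Mask == 0)
    return false;
  Offset = std::countr_zero(Mask);
  ActiveBits = std::popcount(Mask);
  // Contiguous iff Mask is exactly ActiveBits ones shifted left by Offset.
  return Mask == (((1ULL << ActiveBits) - 1) << Offset);
}

int main() {
  // Matches test_shift15_and510 below: (and (srl (load), 15), 510).
  unsigned Offset, ActiveBits;
  assert(isShiftedMask(510, Offset, ActiveBits));
  assert(Offset == 1 && ActiveBits == 8);
  // The i32 load narrows to an i8 load at bit offset 15 + 1 = 16, and a
  // final shl by 1 restores the trailing zero bit.
  printf("narrowed width = %u bits, new shift = %u, final shl = %u\n",
         ActiveBits, 15 + Offset, Offset);
}
```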
    
I've used the ARM tests because that's where the existing (and (srl (load))) tests were.
    
The X86 test was modified so the shifted value is no longer fed directly by a load, keeping the H register in use.

---
Full diff: https://github.com/llvm/llvm-project/pull/80342.diff


3 Files Affected:

- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+17-4) 
- (modified) llvm/test/CodeGen/ARM/shift-combine.ll (+112) 
- (modified) llvm/test/CodeGen/X86/h-registers-2.ll (+3-1) 


``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b17724cd07209..6d669ccdb1f99 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14198,7 +14198,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
   // away, but using an AND rather than a right shift. HasShiftedOffset is used
   // to indicate that the narrowed load should be left-shifted ShAmt bits to get
   // the result.
-  bool HasShiftedOffset = false;
+  unsigned ShiftedOffset = 0;
   // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
   // extended to VT.
   if (Opc == ISD::SIGN_EXTEND_INREG) {
@@ -14243,7 +14243,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
     if (Mask.isMask()) {
       ActiveBits = Mask.countr_one();
     } else if (Mask.isShiftedMask(ShAmt, ActiveBits)) {
-      HasShiftedOffset = true;
+      ShiftedOffset = ShAmt;
     } else {
       return SDValue();
     }
@@ -14307,6 +14307,7 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
     SDNode *Mask = *(SRL->use_begin());
     if (SRL.hasOneUse() && Mask->getOpcode() == ISD::AND &&
         isa<ConstantSDNode>(Mask->getOperand(1))) {
+      unsigned Offset, ActiveBits;
       const APInt& ShiftMask = Mask->getConstantOperandAPInt(1);
       if (ShiftMask.isMask()) {
         EVT MaskedVT =
@@ -14315,6 +14316,18 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
         if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) &&
             TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT))
           ExtVT = MaskedVT;
+      } else if (ExtType == ISD::ZEXTLOAD &&
+                 ShiftMask.isShiftedMask(Offset, ActiveBits) &&
+                 (Offset + ShAmt) < VT.getSizeInBits()) {
+        EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+        // If the mask is shifted we can use a narrower load and a shl to insert
+        // the trailing zeros.
+        if (((Offset + ActiveBits) <= ExtVT.getSizeInBits()) &&
+            TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT)) {
+          ExtVT = MaskedVT;
+          ShAmt = Offset + ShAmt;
+          ShiftedOffset = Offset;
+        }
       }
     }
 
@@ -14400,12 +14413,12 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
                           Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
   }
 
-  if (HasShiftedOffset) {
+  if (ShiftedOffset != 0) {
     // We're using a shifted mask, so the load now has an offset. This means
     // that data has been loaded into the lower bytes than it would have been
     // before, so we need to shl the loaded data into the correct position in the
     // register.
-    SDValue ShiftC = DAG.getConstant(ShAmt, DL, VT);
+    SDValue ShiftC = DAG.getConstant(ShiftedOffset, DL, VT);
     Result = DAG.getNode(ISD::SHL, DL, VT, Result, ShiftC);
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
   }
diff --git a/llvm/test/CodeGen/ARM/shift-combine.ll b/llvm/test/CodeGen/ARM/shift-combine.ll
index 0dd5007b4a413..66417cddd4d56 100644
--- a/llvm/test/CodeGen/ARM/shift-combine.ll
+++ b/llvm/test/CodeGen/ARM/shift-combine.ll
@@ -1278,3 +1278,115 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32
   %r = or <4 x i32> %or.ab, %or.cd
   ret <4 x i32> %r
 }
+
+define arm_aapcscc i32 @test_shift15_and510(ptr nocapture %p) {
+; CHECK-ARM-LABEL: test_shift15_and510:
+; CHECK-ARM:       @ %bb.0: @ %entry
+; CHECK-ARM-NEXT:    ldrb r0, [r0, #2]
+; CHECK-ARM-NEXT:    lsl r0, r0, #1
+; CHECK-ARM-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: test_shift15_and510:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    ldrb r0, [r0, #1]
+; CHECK-BE-NEXT:    lsl r0, r0, #1
+; CHECK-BE-NEXT:    bx lr
+;
+; CHECK-THUMB-LABEL: test_shift15_and510:
+; CHECK-THUMB:       @ %bb.0: @ %entry
+; CHECK-THUMB-NEXT:    ldrb r0, [r0, #2]
+; CHECK-THUMB-NEXT:    lsls r0, r0, #1
+; CHECK-THUMB-NEXT:    bx lr
+;
+; CHECK-ALIGN-LABEL: test_shift15_and510:
+; CHECK-ALIGN:       @ %bb.0: @ %entry
+; CHECK-ALIGN-NEXT:    ldrb r0, [r0, #2]
+; CHECK-ALIGN-NEXT:    lsls r0, r0, #1
+; CHECK-ALIGN-NEXT:    bx lr
+;
+; CHECK-V6M-LABEL: test_shift15_and510:
+; CHECK-V6M:       @ %bb.0: @ %entry
+; CHECK-V6M-NEXT:    ldrb r0, [r0, #2]
+; CHECK-V6M-NEXT:    lsls r0, r0, #1
+; CHECK-V6M-NEXT:    bx lr
+entry:
+  %load = load i32, ptr %p, align 4
+  %lshr = lshr i32 %load, 15
+  %and = and i32 %lshr, 510
+  ret i32 %and
+}
+
+define arm_aapcscc i32 @test_shift22_and1020(ptr nocapture %p) {
+; CHECK-ARM-LABEL: test_shift22_and1020:
+; CHECK-ARM:       @ %bb.0: @ %entry
+; CHECK-ARM-NEXT:    ldrb r0, [r0, #3]
+; CHECK-ARM-NEXT:    lsl r0, r0, #2
+; CHECK-ARM-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: test_shift22_and1020:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    ldrb r0, [r0]
+; CHECK-BE-NEXT:    lsl r0, r0, #2
+; CHECK-BE-NEXT:    bx lr
+;
+; CHECK-THUMB-LABEL: test_shift22_and1020:
+; CHECK-THUMB:       @ %bb.0: @ %entry
+; CHECK-THUMB-NEXT:    ldrb r0, [r0, #3]
+; CHECK-THUMB-NEXT:    lsls r0, r0, #2
+; CHECK-THUMB-NEXT:    bx lr
+;
+; CHECK-ALIGN-LABEL: test_shift22_and1020:
+; CHECK-ALIGN:       @ %bb.0: @ %entry
+; CHECK-ALIGN-NEXT:    ldrb r0, [r0, #3]
+; CHECK-ALIGN-NEXT:    lsls r0, r0, #2
+; CHECK-ALIGN-NEXT:    bx lr
+;
+; CHECK-V6M-LABEL: test_shift22_and1020:
+; CHECK-V6M:       @ %bb.0: @ %entry
+; CHECK-V6M-NEXT:    ldrb r0, [r0, #3]
+; CHECK-V6M-NEXT:    lsls r0, r0, #2
+; CHECK-V6M-NEXT:    bx lr
+entry:
+  %load = load i32, ptr %p, align 4
+  %lshr = lshr i32 %load, 22
+  %and = and i32 %lshr, 1020
+  ret i32 %and
+}
+
+define arm_aapcscc i32 @test_zext_shift5_and2040(ptr nocapture %p) {
+; CHECK-ARM-LABEL: test_zext_shift5_and2040:
+; CHECK-ARM:       @ %bb.0: @ %entry
+; CHECK-ARM-NEXT:    ldrb r0, [r0, #1]
+; CHECK-ARM-NEXT:    lsl r0, r0, #3
+; CHECK-ARM-NEXT:    bx lr
+;
+; CHECK-BE-LABEL: test_zext_shift5_and2040:
+; CHECK-BE:       @ %bb.0: @ %entry
+; CHECK-BE-NEXT:    ldrb r0, [r0]
+; CHECK-BE-NEXT:    lsl r0, r0, #3
+; CHECK-BE-NEXT:    bx lr
+;
+; CHECK-THUMB-LABEL: test_zext_shift5_and2040:
+; CHECK-THUMB:       @ %bb.0: @ %entry
+; CHECK-THUMB-NEXT:    ldrb r0, [r0, #1]
+; CHECK-THUMB-NEXT:    lsls r0, r0, #3
+; CHECK-THUMB-NEXT:    bx lr
+;
+; CHECK-ALIGN-LABEL: test_zext_shift5_and2040:
+; CHECK-ALIGN:       @ %bb.0: @ %entry
+; CHECK-ALIGN-NEXT:    ldrb r0, [r0, #1]
+; CHECK-ALIGN-NEXT:    lsls r0, r0, #3
+; CHECK-ALIGN-NEXT:    bx lr
+;
+; CHECK-V6M-LABEL: test_zext_shift5_and2040:
+; CHECK-V6M:       @ %bb.0: @ %entry
+; CHECK-V6M-NEXT:    ldrb r0, [r0, #1]
+; CHECK-V6M-NEXT:    lsls r0, r0, #3
+; CHECK-V6M-NEXT:    bx lr
+entry:
+  %load = load i16, ptr %p, align 2
+  %zext = zext i16 %load to i32
+  %lshr = lshr i32 %zext, 5
+  %and = and i32 %lshr, 2040
+  ret i32 %and
+}
diff --git a/llvm/test/CodeGen/X86/h-registers-2.ll b/llvm/test/CodeGen/X86/h-registers-2.ll
index 451540524798c..5c42c97e7a43e 100644
--- a/llvm/test/CodeGen/X86/h-registers-2.ll
+++ b/llvm/test/CodeGen/X86/h-registers-2.ll
@@ -9,12 +9,14 @@ define i32 @foo(ptr %x, i32 %y) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    imull %eax, %eax
 ; CHECK-NEXT:    movzbl %ah, %eax
 ; CHECK-NEXT:    movb $77, (%ecx,%eax,8)
 ; CHECK-NEXT:    shll $3, %eax
 ; CHECK-NEXT:    retl
 
-	%t0 = lshr i32 %y, 8		; <i32> [#uses=1]
+	%t4 = mul i32 %y, %y
+	%t0 = lshr i32 %t4, 8		; <i32> [#uses=1]
 	%t1 = and i32 %t0, 255		; <i32> [#uses=2]
   %t2 = shl i32 %t1, 3
 	%t3 = getelementptr i8, ptr %x, i32 %t2		; <ptr> [#uses=1]

``````````
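
The byte offsets in the ARM checks above follow from the combined shift amount and the target's endianness. A hedged sketch that reproduces the offsets in the three new tests (the names are illustrative, not the DAGCombiner's own):

```cpp
#include <cassert>

// Byte offset of the narrowed load: LoadBits is the original load width,
// ShAmt the combined bit shift (srl amount + mask trailing zeros), and
// ExtBits the narrowed width.
static unsigned narrowedByteOffset(unsigned LoadBits, unsigned ShAmt,
                                   unsigned ExtBits, bool BigEndian) {
  unsigned BitOffset = BigEndian ? LoadBits - ShAmt - ExtBits : ShAmt;
  return BitOffset / 8;
}

int main() {
  // test_shift15_and510: i32 load, srl 15, mask 510 -> ShAmt 15+1=16.
  assert(narrowedByteOffset(32, 16, 8, /*BigEndian=*/false) == 2); // ldrb [r0, #2]
  assert(narrowedByteOffset(32, 16, 8, /*BigEndian=*/true) == 1);  // ldrb [r0, #1]
  // test_shift22_and1020: i32 load, srl 22, mask 1020 -> ShAmt 22+2=24.
  assert(narrowedByteOffset(32, 24, 8, false) == 3); // ldrb [r0, #3]
  assert(narrowedByteOffset(32, 24, 8, true) == 0);  // ldrb [r0]
  // test_zext_shift5_and2040: i16 zextload, srl 5, mask 2040 -> ShAmt 5+3=8.
  assert(narrowedByteOffset(16, 8, 8, false) == 1); // ldrb [r0, #1]
  assert(narrowedByteOffset(16, 8, 8, true) == 0);  // ldrb [r0]
}
```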

</details>


https://github.com/llvm/llvm-project/pull/80342

