[llvm] 77fccb3 - [AArch64] Replace AND with LSL#2 for LDR target (#34101) (#89531)

via llvm-commits llvm-commits at lists.llvm.org
Sat Aug 24 12:30:43 PDT 2024


Author: hanbeom
Date: 2024-08-24T20:30:40+01:00
New Revision: 77fccb35ac08f66d52bb152735e27572bf9f3f93

URL: https://github.com/llvm/llvm-project/commit/77fccb35ac08f66d52bb152735e27572bf9f3f93
DIFF: https://github.com/llvm/llvm-project/commit/77fccb35ac08f66d52bb152735e27572bf9f3f93.diff

LOG: [AArch64] Replace AND with LSL#2 for LDR target (#34101) (#89531)

Currently, the `DAGCombiner` folds a pair of constant shifts (`LSR`/`LSL`)
into an `AND` mask.

However, in certain cases the `AND` generated by this fold can be removed
entirely.

Consider the following case:
```
        lsr x8, x8, #56
        and x8, x8, #0xfc
        ldr w0, [x2, x8]
        ret
```

In this case, we can remove the `AND` by changing the `LDR` addressing mode
to `[X2, X8, LSL #2]` and increasing the right-shift amount from 56 to 58.

After the change:
```
        lsr x8, x8, #58
        ldr w0, [x2, x8, lsl #2]
        ret
```

This patch checks whether the shift + `AND` sequence feeding a load address
can be optimized in this way, and performs the transformation when it can.
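For intuition, the rewrite relies on the identity `(x >> 56) & 0xfc ==
((x >> 58) << 2)`: the mask `0xfc` keeps only bits 58..63 of the original
value, and the cleared low two bits are reintroduced by the scaled
addressing mode. A minimal standalone C++ sketch of that identity (not part
of the patch, shown only for illustration):

```
#include <cassert>
#include <cstdint>

int main() {
  // (x >> 56) & 0xfc keeps bits 58..63 of x at bit positions 2..7;
  // (x >> 58) << 2 places the same bits at the same positions, which is
  // exactly what the scaled addressing mode [x2, x8, lsl #2] computes.
  const uint64_t samples[] = {0ULL, ~0ULL, 0x0123456789abcdefULL,
                              0xfedcba9876543210ULL};
  for (uint64_t x : samples)
    assert(((x >> 56) & 0xfc) == ((x >> 58) << 2));
  return 0;
}
```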

Added: 
    llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8c2f85657ff87e..5ac5b7f8a5ab18 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18023,6 +18023,23 @@ bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
     return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
   }
 
+  // We do not need to fold when this shift is used in the following load
+  // pattern: (ldr x, (add x, (shl (srl x, c1) 2)))
+  if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
+    if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+      unsigned ShlAmt = C2->getZExtValue();
+      if (auto ShouldADD = *N->use_begin();
+          ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
+        if (auto ShouldLOAD = dyn_cast<LoadSDNode>(*ShouldADD->use_begin())) {
+          unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
+          if ((1ULL << ShlAmt) == ByteVT &&
+              isIndexedLoadLegal(ISD::PRE_INC, ShouldLOAD->getMemoryVT()))
+            return false;
+        }
+      }
+    }
+  }
+
   return true;
 }
 

diff  --git a/llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll b/llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll
new file mode 100644
index 00000000000000..9dfc8df703ce64
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-fold-lshr.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+;
+
+define i16 @load16_shr63(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load16_shr63:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #63
+; CHECK-NEXT:    ldrh w0, [x2, x8, lsl #1]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 63
+  %arrayidx = getelementptr inbounds i16, ptr %table, i64 %shr
+  %0 = load i16, ptr %arrayidx, align 2
+  ret i16 %0
+}
+
+define i16 @load16_shr2(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load16_shr2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #2
+; CHECK-NEXT:    ldrh w0, [x2, x8, lsl #1]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 2
+  %arrayidx = getelementptr inbounds i16, ptr %table, i64 %shr
+  %0 = load i16, ptr %arrayidx, align 2
+  ret i16 %0
+}
+
+define i16 @load16_shr1(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load16_shr1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #1
+; CHECK-NEXT:    ldrh w0, [x2, x8, lsl #1]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 1
+  %arrayidx = getelementptr inbounds i16, ptr %table, i64 %shr
+  %0 = load i16, ptr %arrayidx, align 2
+  ret i16 %0
+}
+
+define i32 @load32_shr63(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load32_shr63:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #63
+; CHECK-NEXT:    ldr w0, [x2, x8, lsl #2]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 63
+  %arrayidx = getelementptr inbounds i32, ptr %table, i64 %shr
+  %0 = load i32, ptr %arrayidx, align 4
+  ret i32 %0
+}
+
+define i32 @load32_shr2(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load32_shr2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #2
+; CHECK-NEXT:    ldr w0, [x2, x8, lsl #2]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 2
+  %arrayidx = getelementptr inbounds i32, ptr %table, i64 %shr
+  %0 = load i32, ptr %arrayidx, align 4
+  ret i32 %0
+}
+
+define i32 @load32_shr1(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load32_shr1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #1
+; CHECK-NEXT:    ldr w0, [x2, x8, lsl #2]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 1
+  %arrayidx = getelementptr inbounds i32, ptr %table, i64 %shr
+  %0 = load i32, ptr %arrayidx, align 4
+  ret i32 %0
+}
+
+define i64 @load64_shr63(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load64_shr63:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #63
+; CHECK-NEXT:    ldr x0, [x2, x8, lsl #3]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 63
+  %arrayidx = getelementptr inbounds i64, ptr %table, i64 %shr
+  %0 = load i64, ptr %arrayidx, align 8
+  ret i64 %0
+}
+
+define i64 @load64_shr2(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load64_shr2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #2
+; CHECK-NEXT:    ldr x0, [x2, x8, lsl #3]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 2
+  %arrayidx = getelementptr inbounds i64, ptr %table, i64 %shr
+  %0 = load i64, ptr %arrayidx, align 8
+  ret i64 %0
+}
+
+define i64 @load64_shr1(i64 %a, i64 %b, ptr %table) {
+; CHECK-LABEL: load64_shr1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul x8, x1, x0
+; CHECK-NEXT:    lsr x8, x8, #1
+; CHECK-NEXT:    ldr x0, [x2, x8, lsl #3]
+; CHECK-NEXT:    ret
+entry:
+  %mul = mul i64 %b, %a
+  %shr = lshr i64 %mul, 1
+  %arrayidx = getelementptr inbounds i64, ptr %table, i64 %shr
+  %0 = load i64, ptr %arrayidx, align 8
+  ret i64 %0
+}
