[llvm] [AArch64] Combine and and lsl into ubfiz (PR #118974)

Cullen Rhodes via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 8 06:58:26 PST 2025


https://github.com/c-rhodes updated https://github.com/llvm/llvm-project/pull/118974

>From c41f6655d5c3e8fcf476fbb3c9181ae72dfda84f Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes at arm.com>
Date: Fri, 6 Dec 2024 13:46:58 +0000
Subject: [PATCH 1/6] [AArch64] Combine and and lsl into ubfiz

Fixes #118132.
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td      |  9 +++++++++
 .../test/CodeGen/AArch64/aarch64-fold-lslfast.ll | 10 ++++------
 llvm/test/CodeGen/AArch64/xbfiz.ll               | 16 ++++++++++++++++
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c6f5cdcd1d5fe7..6acac914dbbba6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8968,6 +8968,15 @@ def : Pat<(shl (i64 (zext GPR32:$Rn)), (i64 imm0_63:$imm)),
                    (i64 (i64shift_a        imm0_63:$imm)),
                    (i64 (i64shift_sext_i32 imm0_63:$imm)))>;
 
+def : Pat<(shl (i64 (and (i64 (anyext GPR32:$Rn)), 0xff)), (i64 imm0_63:$imm)),
+          (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 (i64shift_a        imm0_63:$imm)),
+                   (i64 (i64shift_sext_i8 imm0_63:$imm)))>;
+def : Pat<(shl (i64 (and (i64 (anyext GPR32:$Rn)), 0xffff)), (i64 imm0_63:$imm)),
+          (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 (i64shift_a        imm0_63:$imm)),
+                   (i64 (i64shift_sext_i16 imm0_63:$imm)))>;
+
 // sra patterns have an AddedComplexity of 10, so make sure we have a higher
 // AddedComplexity for the following patterns since we want to match sext + sra
 // patterns before we attempt to match a single sra node.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index 63dcafed2320a0..abc5c0876e80b7 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -13,11 +13,10 @@ define i16 @halfword(ptr %ctx, i32 %xor72) nounwind {
 ; CHECK0-SDAG-LABEL: halfword:
 ; CHECK0-SDAG:       // %bb.0:
 ; CHECK0-SDAG-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK0-SDAG-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK0-SDAG-NEXT:    ubfx x8, x1, #9, #8
+; CHECK0-SDAG-NEXT:    lsr w8, w1, #9
 ; CHECK0-SDAG-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK0-SDAG-NEXT:    mov x19, x0
-; CHECK0-SDAG-NEXT:    lsl x21, x8, #1
+; CHECK0-SDAG-NEXT:    ubfiz x21, x8, #1, #8
 ; CHECK0-SDAG-NEXT:    ldrh w20, [x0, x21]
 ; CHECK0-SDAG-NEXT:    bl foo
 ; CHECK0-SDAG-NEXT:    mov w0, w20
@@ -231,10 +230,9 @@ define i16 @multi_use_half_word(ptr %ctx, i32 %xor72) {
 ; CHECK0-SDAG-NEXT:    .cfi_offset w21, -24
 ; CHECK0-SDAG-NEXT:    .cfi_offset w22, -32
 ; CHECK0-SDAG-NEXT:    .cfi_offset w30, -48
-; CHECK0-SDAG-NEXT:    // kill: def $w1 killed $w1 def $x1
-; CHECK0-SDAG-NEXT:    ubfx x8, x1, #9, #8
+; CHECK0-SDAG-NEXT:    lsr w8, w1, #9
 ; CHECK0-SDAG-NEXT:    mov x19, x0
-; CHECK0-SDAG-NEXT:    lsl x21, x8, #1
+; CHECK0-SDAG-NEXT:    ubfiz x21, x8, #1, #8
 ; CHECK0-SDAG-NEXT:    ldrh w20, [x0, x21]
 ; CHECK0-SDAG-NEXT:    add w22, w20, #1
 ; CHECK0-SDAG-NEXT:    bl foo
diff --git a/llvm/test/CodeGen/AArch64/xbfiz.ll b/llvm/test/CodeGen/AArch64/xbfiz.ll
index b777ddcb7efcc4..05567e34258402 100644
--- a/llvm/test/CodeGen/AArch64/xbfiz.ll
+++ b/llvm/test/CodeGen/AArch64/xbfiz.ll
@@ -69,3 +69,19 @@ define i64 @lsl32_not_ubfiz64(i64 %v) {
   %and = and i64 %shl, 4294967295
   ret i64 %and
 }
+
+define i64 @lsl_zext_i8_i64(i8 %b) {
+; CHECK-LABEL: lsl_zext_i8_i64:
+; CHECK:    ubfiz x0, x0, #1, #8
+  %1 = zext i8 %b to i64
+  %2 = shl i64 %1, 1
+  ret i64 %2
+}
+
+define i64 @lsl_zext_i16_i64(i16 %b) {
+; CHECK-LABEL: lsl_zext_i16_i64:
+; CHECK:    ubfiz x0, x0, #1, #16
+  %1 = zext i16 %b to i64
+  %2 = shl i64 %1, 1
+  ret i64 %2
+}
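
For reference, a minimal C++ model of what UBFIZ computes, assuming the usual ARMv8 semantics (the helper name below is made up, not LLVM code): ubfiz xd, xn, #lsb, #width zeroes the destination and inserts the low "width" bits of the source at bit "lsb", which is exactly the zext-then-shift shape the new patterns match.

    // Illustrative model only; assumes lsb + width <= 64.
    #include <cstdint>
    static uint64_t ubfiz64(uint64_t xn, unsigned lsb, unsigned width) {
      uint64_t mask = (width == 64) ? ~0ULL : ((1ULL << width) - 1);
      return (xn & mask) << lsb;
    }
    // ubfiz64(x, 1, 8) == (x & 0xff) << 1, i.e. the lsl_zext_i8_i64 test above.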

>From 50993210203ac2070a3dbea27258c10629ef71e7 Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes at arm.com>
Date: Fri, 13 Dec 2024 12:30:25 +0000
Subject: [PATCH 2/6] Move to target DAG-combine

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  34 ++++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |   9 --
 .../CodeGen/AArch64/aarch64-fold-lslfast.ll   |  10 +-
 .../AArch64/const-shift-of-constmasked.ll     | 101 ++++++++----------
 llvm/test/CodeGen/AArch64/extract-bits.ll     |  16 +--
 llvm/test/CodeGen/AArch64/fpenv.ll            |   6 +-
 .../CodeGen/AArch64/swap-compare-operands.ll  |  42 +++++---
 7 files changed, 123 insertions(+), 95 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3ad2905ce52076..5c6b04d637b5c4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1140,6 +1140,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
 
+  setTargetDAGCombine(ISD::SHL);
+
   // In case of strict alignment, avoid an excessive number of byte wide stores.
   MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemset =
@@ -26365,6 +26367,36 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   return NVCAST;
 }
 
+static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  // If the operand is a bitwise AND with a constant RHS, and the shift is the
+  // only use, we can pull it out of the shift.
+  //
+  // (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
+  if (!Op0.hasOneUse() || Op0.getOpcode() != ISD::AND)
+    return SDValue();
+
+  ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
+  ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(Op1);
+  if (!C1 || !C2)
+    return SDValue();
+
+  // Might be folded into shifted add/sub, do not lower.
+  if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
+                         N->use_begin()->getOpcode() == ISD::SUB))
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, Op0.getOperand(1), Op1);
+  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, Op0->getOperand(0), Op1);
+  return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -26710,6 +26742,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performCTLZCombine(N, DAG, Subtarget);
   case ISD::SCALAR_TO_VECTOR:
     return performScalarToVectorCombine(N, DCI, DAG);
+  case ISD::SHL:
+    return performSHLCombine(N, DAG);
   }
   return SDValue();
 }
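
The rewrite relies on a left shift distributing over the constant mask: ((x & c1) << c2) == ((x << c2) & (c1 << c2)), since any bit cleared by c1 stays cleared after both sides shift by the same amount. A standalone check of that identity, with constants chosen to mirror test_i16_2032_mask_shl_3 in the const-shift-of-constmasked.ll diff below (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>
    int main() {
      const uint32_t c1 = 0x7f0, c2 = 3;       // and #0x7f0 ... shl #3
      for (uint32_t x = 0; x <= 0xffff; ++x)   // exhaust the i16 input range
        assert(((x & c1) << c2) == ((x << c2) & (c1 << c2)));
      return 0;                                // shifted mask is 0x3f80
    }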
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6acac914dbbba6..c6f5cdcd1d5fe7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8968,15 +8968,6 @@ def : Pat<(shl (i64 (zext GPR32:$Rn)), (i64 imm0_63:$imm)),
                    (i64 (i64shift_a        imm0_63:$imm)),
                    (i64 (i64shift_sext_i32 imm0_63:$imm)))>;
 
-def : Pat<(shl (i64 (and (i64 (anyext GPR32:$Rn)), 0xff)), (i64 imm0_63:$imm)),
-          (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
-                   (i64 (i64shift_a        imm0_63:$imm)),
-                   (i64 (i64shift_sext_i8 imm0_63:$imm)))>;
-def : Pat<(shl (i64 (and (i64 (anyext GPR32:$Rn)), 0xffff)), (i64 imm0_63:$imm)),
-          (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
-                   (i64 (i64shift_a        imm0_63:$imm)),
-                   (i64 (i64shift_sext_i16 imm0_63:$imm)))>;
-
 // sra patterns have an AddedComplexity of 10, so make sure we have a higher
 // AddedComplexity for the following patterns since we want to match sext + sra
 // patterns before we attempt to match a single sra node.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index abc5c0876e80b7..63dcafed2320a0 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -13,10 +13,11 @@ define i16 @halfword(ptr %ctx, i32 %xor72) nounwind {
 ; CHECK0-SDAG-LABEL: halfword:
 ; CHECK0-SDAG:       // %bb.0:
 ; CHECK0-SDAG-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK0-SDAG-NEXT:    lsr w8, w1, #9
+; CHECK0-SDAG-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK0-SDAG-NEXT:    ubfx x8, x1, #9, #8
 ; CHECK0-SDAG-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK0-SDAG-NEXT:    mov x19, x0
-; CHECK0-SDAG-NEXT:    ubfiz x21, x8, #1, #8
+; CHECK0-SDAG-NEXT:    lsl x21, x8, #1
 ; CHECK0-SDAG-NEXT:    ldrh w20, [x0, x21]
 ; CHECK0-SDAG-NEXT:    bl foo
 ; CHECK0-SDAG-NEXT:    mov w0, w20
@@ -230,9 +231,10 @@ define i16 @multi_use_half_word(ptr %ctx, i32 %xor72) {
 ; CHECK0-SDAG-NEXT:    .cfi_offset w21, -24
 ; CHECK0-SDAG-NEXT:    .cfi_offset w22, -32
 ; CHECK0-SDAG-NEXT:    .cfi_offset w30, -48
-; CHECK0-SDAG-NEXT:    lsr w8, w1, #9
+; CHECK0-SDAG-NEXT:    // kill: def $w1 killed $w1 def $x1
+; CHECK0-SDAG-NEXT:    ubfx x8, x1, #9, #8
 ; CHECK0-SDAG-NEXT:    mov x19, x0
-; CHECK0-SDAG-NEXT:    ubfiz x21, x8, #1, #8
+; CHECK0-SDAG-NEXT:    lsl x21, x8, #1
 ; CHECK0-SDAG-NEXT:    ldrh w20, [x0, x21]
 ; CHECK0-SDAG-NEXT:    add w22, w20, #1
 ; CHECK0-SDAG-NEXT:    bl foo
diff --git a/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll b/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll
index 66a6745cda8f76..1fffcdda4b4167 100644
--- a/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/AArch64/const-shift-of-constmasked.ll
@@ -190,8 +190,7 @@ define i8 @test_i8_224_mask_ashr_6(i8 %a0) {
 define i8 @test_i8_7_mask_shl_1(i8 %a0) {
 ; CHECK-LABEL: test_i8_7_mask_shl_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7
-; CHECK-NEXT:    lsl w0, w8, #1
+; CHECK-NEXT:    ubfiz w0, w0, #1, #3
 ; CHECK-NEXT:    ret
   %t0 = and i8 %a0, 7
   %t1 = shl i8 %t0, 1
@@ -200,8 +199,7 @@ define i8 @test_i8_7_mask_shl_1(i8 %a0) {
 define i8 @test_i8_7_mask_shl_4(i8 %a0) {
 ; CHECK-LABEL: test_i8_7_mask_shl_4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7
-; CHECK-NEXT:    lsl w0, w8, #4
+; CHECK-NEXT:    ubfiz w0, w0, #4, #3
 ; CHECK-NEXT:    ret
   %t0 = and i8 %a0, 7
   %t1 = shl i8 %t0, 4
@@ -229,8 +227,8 @@ define i8 @test_i8_7_mask_shl_6(i8 %a0) {
 define i8 @test_i8_28_mask_shl_1(i8 %a0) {
 ; CHECK-LABEL: test_i8_28_mask_shl_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1c
-; CHECK-NEXT:    lsl w0, w8, #1
+; CHECK-NEXT:    lsl w8, w0, #1
+; CHECK-NEXT:    and w0, w8, #0x38
 ; CHECK-NEXT:    ret
   %t0 = and i8 %a0, 28
   %t1 = shl i8 %t0, 1
@@ -239,8 +237,8 @@ define i8 @test_i8_28_mask_shl_1(i8 %a0) {
 define i8 @test_i8_28_mask_shl_2(i8 %a0) {
 ; CHECK-LABEL: test_i8_28_mask_shl_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1c
-; CHECK-NEXT:    lsl w0, w8, #2
+; CHECK-NEXT:    lsl w8, w0, #2
+; CHECK-NEXT:    and w0, w8, #0x70
 ; CHECK-NEXT:    ret
   %t0 = and i8 %a0, 28
   %t1 = shl i8 %t0, 2
@@ -249,8 +247,8 @@ define i8 @test_i8_28_mask_shl_2(i8 %a0) {
 define i8 @test_i8_28_mask_shl_3(i8 %a0) {
 ; CHECK-LABEL: test_i8_28_mask_shl_3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x1c
-; CHECK-NEXT:    lsl w0, w8, #3
+; CHECK-NEXT:    lsl w8, w0, #3
+; CHECK-NEXT:    and w0, w8, #0xe0
 ; CHECK-NEXT:    ret
   %t0 = and i8 %a0, 28
   %t1 = shl i8 %t0, 3
@@ -259,8 +257,8 @@ define i8 @test_i8_28_mask_shl_3(i8 %a0) {
 define i8 @test_i8_28_mask_shl_4(i8 %a0) {
 ; CHECK-LABEL: test_i8_28_mask_shl_4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xc
-; CHECK-NEXT:    lsl w0, w8, #4
+; CHECK-NEXT:    lsl w8, w0, #4
+; CHECK-NEXT:    and w0, w8, #0xc0
 ; CHECK-NEXT:    ret
   %t0 = and i8 %a0, 28
   %t1 = shl i8 %t0, 4
@@ -270,8 +268,8 @@ define i8 @test_i8_28_mask_shl_4(i8 %a0) {
 define i8 @test_i8_224_mask_shl_1(i8 %a0) {
 ; CHECK-LABEL: test_i8_224_mask_shl_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x60
-; CHECK-NEXT:    lsl w0, w8, #1
+; CHECK-NEXT:    lsl w8, w0, #1
+; CHECK-NEXT:    and w0, w8, #0xc0
 ; CHECK-NEXT:    ret
   %t0 = and i8 %a0, 224
   %t1 = shl i8 %t0, 1
@@ -465,8 +463,7 @@ define i16 @test_i16_65024_mask_ashr_10(i16 %a0) {
 define i16 @test_i16_127_mask_shl_1(i16 %a0) {
 ; CHECK-LABEL: test_i16_127_mask_shl_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7f
-; CHECK-NEXT:    lsl w0, w8, #1
+; CHECK-NEXT:    ubfiz w0, w0, #1, #7
 ; CHECK-NEXT:    ret
   %t0 = and i16 %a0, 127
   %t1 = shl i16 %t0, 1
@@ -475,8 +472,7 @@ define i16 @test_i16_127_mask_shl_1(i16 %a0) {
 define i16 @test_i16_127_mask_shl_8(i16 %a0) {
 ; CHECK-LABEL: test_i16_127_mask_shl_8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7f
-; CHECK-NEXT:    lsl w0, w8, #8
+; CHECK-NEXT:    ubfiz w0, w0, #8, #7
 ; CHECK-NEXT:    ret
   %t0 = and i16 %a0, 127
   %t1 = shl i16 %t0, 8
@@ -504,8 +500,8 @@ define i16 @test_i16_127_mask_shl_10(i16 %a0) {
 define i16 @test_i16_2032_mask_shl_3(i16 %a0) {
 ; CHECK-LABEL: test_i16_2032_mask_shl_3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7f0
-; CHECK-NEXT:    lsl w0, w8, #3
+; CHECK-NEXT:    lsl w8, w0, #3
+; CHECK-NEXT:    and w0, w8, #0x3f80
 ; CHECK-NEXT:    ret
   %t0 = and i16 %a0, 2032
   %t1 = shl i16 %t0, 3
@@ -514,8 +510,8 @@ define i16 @test_i16_2032_mask_shl_3(i16 %a0) {
 define i16 @test_i16_2032_mask_shl_4(i16 %a0) {
 ; CHECK-LABEL: test_i16_2032_mask_shl_4:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7f0
-; CHECK-NEXT:    lsl w0, w8, #4
+; CHECK-NEXT:    lsl w8, w0, #4
+; CHECK-NEXT:    and w0, w8, #0x7f00
 ; CHECK-NEXT:    ret
   %t0 = and i16 %a0, 2032
   %t1 = shl i16 %t0, 4
@@ -524,8 +520,8 @@ define i16 @test_i16_2032_mask_shl_4(i16 %a0) {
 define i16 @test_i16_2032_mask_shl_5(i16 %a0) {
 ; CHECK-LABEL: test_i16_2032_mask_shl_5:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7f0
-; CHECK-NEXT:    lsl w0, w8, #5
+; CHECK-NEXT:    lsl w8, w0, #5
+; CHECK-NEXT:    and w0, w8, #0xfe00
 ; CHECK-NEXT:    ret
   %t0 = and i16 %a0, 2032
   %t1 = shl i16 %t0, 5
@@ -534,8 +530,8 @@ define i16 @test_i16_2032_mask_shl_5(i16 %a0) {
 define i16 @test_i16_2032_mask_shl_6(i16 %a0) {
 ; CHECK-LABEL: test_i16_2032_mask_shl_6:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x3f0
-; CHECK-NEXT:    lsl w0, w8, #6
+; CHECK-NEXT:    lsl w8, w0, #6
+; CHECK-NEXT:    and w0, w8, #0xfc00
 ; CHECK-NEXT:    ret
   %t0 = and i16 %a0, 2032
   %t1 = shl i16 %t0, 6
@@ -545,8 +541,8 @@ define i16 @test_i16_2032_mask_shl_6(i16 %a0) {
 define i16 @test_i16_65024_mask_shl_1(i16 %a0) {
 ; CHECK-LABEL: test_i16_65024_mask_shl_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7e00
-; CHECK-NEXT:    lsl w0, w8, #1
+; CHECK-NEXT:    lsl w8, w0, #1
+; CHECK-NEXT:    and w0, w8, #0xfc00
 ; CHECK-NEXT:    ret
   %t0 = and i16 %a0, 65024
   %t1 = shl i16 %t0, 1
@@ -740,8 +736,7 @@ define i32 @test_i32_4294836224_mask_ashr_18(i32 %a0) {
 define i32 @test_i32_32767_mask_shl_1(i32 %a0) {
 ; CHECK-LABEL: test_i32_32767_mask_shl_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7fff
-; CHECK-NEXT:    lsl w0, w8, #1
+; CHECK-NEXT:    ubfiz w0, w0, #1, #15
 ; CHECK-NEXT:    ret
   %t0 = and i32 %a0, 32767
   %t1 = shl i32 %t0, 1
@@ -750,8 +745,7 @@ define i32 @test_i32_32767_mask_shl_1(i32 %a0) {
 define i32 @test_i32_32767_mask_shl_16(i32 %a0) {
 ; CHECK-LABEL: test_i32_32767_mask_shl_16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7fff
-; CHECK-NEXT:    lsl w0, w8, #16
+; CHECK-NEXT:    ubfiz w0, w0, #16, #15
 ; CHECK-NEXT:    ret
   %t0 = and i32 %a0, 32767
   %t1 = shl i32 %t0, 16
@@ -779,8 +773,8 @@ define i32 @test_i32_32767_mask_shl_18(i32 %a0) {
 define i32 @test_i32_8388352_mask_shl_7(i32 %a0) {
 ; CHECK-LABEL: test_i32_8388352_mask_shl_7:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7fff00
-; CHECK-NEXT:    lsl w0, w8, #7
+; CHECK-NEXT:    lsl w8, w0, #7
+; CHECK-NEXT:    and w0, w8, #0x3fff8000
 ; CHECK-NEXT:    ret
   %t0 = and i32 %a0, 8388352
   %t1 = shl i32 %t0, 7
@@ -789,8 +783,8 @@ define i32 @test_i32_8388352_mask_shl_7(i32 %a0) {
 define i32 @test_i32_8388352_mask_shl_8(i32 %a0) {
 ; CHECK-LABEL: test_i32_8388352_mask_shl_8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7fff00
-; CHECK-NEXT:    lsl w0, w8, #8
+; CHECK-NEXT:    lsl w8, w0, #8
+; CHECK-NEXT:    and w0, w8, #0x7fff0000
 ; CHECK-NEXT:    ret
   %t0 = and i32 %a0, 8388352
   %t1 = shl i32 %t0, 8
@@ -799,8 +793,8 @@ define i32 @test_i32_8388352_mask_shl_8(i32 %a0) {
 define i32 @test_i32_8388352_mask_shl_9(i32 %a0) {
 ; CHECK-LABEL: test_i32_8388352_mask_shl_9:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7fff00
-; CHECK-NEXT:    lsl w0, w8, #9
+; CHECK-NEXT:    lsl w8, w0, #9
+; CHECK-NEXT:    and w0, w8, #0xfffe0000
 ; CHECK-NEXT:    ret
   %t0 = and i32 %a0, 8388352
   %t1 = shl i32 %t0, 9
@@ -809,8 +803,8 @@ define i32 @test_i32_8388352_mask_shl_9(i32 %a0) {
 define i32 @test_i32_8388352_mask_shl_10(i32 %a0) {
 ; CHECK-LABEL: test_i32_8388352_mask_shl_10:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x3fff00
-; CHECK-NEXT:    lsl w0, w8, #10
+; CHECK-NEXT:    lsl w8, w0, #10
+; CHECK-NEXT:    and w0, w8, #0xfffc0000
 ; CHECK-NEXT:    ret
   %t0 = and i32 %a0, 8388352
   %t1 = shl i32 %t0, 10
@@ -820,8 +814,8 @@ define i32 @test_i32_8388352_mask_shl_10(i32 %a0) {
 define i32 @test_i32_4294836224_mask_shl_1(i32 %a0) {
 ; CHECK-LABEL: test_i32_4294836224_mask_shl_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0x7ffe0000
-; CHECK-NEXT:    lsl w0, w8, #1
+; CHECK-NEXT:    lsl w8, w0, #1
+; CHECK-NEXT:    and w0, w8, #0xfffc0000
 ; CHECK-NEXT:    ret
   %t0 = and i32 %a0, 4294836224
   %t1 = shl i32 %t0, 1
@@ -1015,8 +1009,7 @@ define i64 @test_i64_18446744065119617024_mask_ashr_34(i64 %a0) {
 define i64 @test_i64_2147483647_mask_shl_1(i64 %a0) {
 ; CHECK-LABEL: test_i64_2147483647_mask_shl_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x8, x0, #0x7fffffff
-; CHECK-NEXT:    lsl x0, x8, #1
+; CHECK-NEXT:    lsl w0, w0, #1
 ; CHECK-NEXT:    ret
   %t0 = and i64 %a0, 2147483647
   %t1 = shl i64 %t0, 1
@@ -1054,8 +1047,8 @@ define i64 @test_i64_2147483647_mask_shl_34(i64 %a0) {
 define i64 @test_i64_140737488289792_mask_shl_15(i64 %a0) {
 ; CHECK-LABEL: test_i64_140737488289792_mask_shl_15:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x8, x0, #0x7fffffff0000
-; CHECK-NEXT:    lsl x0, x8, #15
+; CHECK-NEXT:    lsl x8, x0, #15
+; CHECK-NEXT:    and x0, x8, #0x3fffffff80000000
 ; CHECK-NEXT:    ret
   %t0 = and i64 %a0, 140737488289792
   %t1 = shl i64 %t0, 15
@@ -1064,8 +1057,8 @@ define i64 @test_i64_140737488289792_mask_shl_15(i64 %a0) {
 define i64 @test_i64_140737488289792_mask_shl_16(i64 %a0) {
 ; CHECK-LABEL: test_i64_140737488289792_mask_shl_16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x8, x0, #0x7fffffff0000
-; CHECK-NEXT:    lsl x0, x8, #16
+; CHECK-NEXT:    lsl x8, x0, #16
+; CHECK-NEXT:    and x0, x8, #0x7fffffff00000000
 ; CHECK-NEXT:    ret
   %t0 = and i64 %a0, 140737488289792
   %t1 = shl i64 %t0, 16
@@ -1074,8 +1067,8 @@ define i64 @test_i64_140737488289792_mask_shl_16(i64 %a0) {
 define i64 @test_i64_140737488289792_mask_shl_17(i64 %a0) {
 ; CHECK-LABEL: test_i64_140737488289792_mask_shl_17:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x8, x0, #0x7fffffff0000
-; CHECK-NEXT:    lsl x0, x8, #17
+; CHECK-NEXT:    lsl x8, x0, #17
+; CHECK-NEXT:    and x0, x8, #0xfffffffe00000000
 ; CHECK-NEXT:    ret
   %t0 = and i64 %a0, 140737488289792
   %t1 = shl i64 %t0, 17
@@ -1084,8 +1077,8 @@ define i64 @test_i64_140737488289792_mask_shl_17(i64 %a0) {
 define i64 @test_i64_140737488289792_mask_shl_18(i64 %a0) {
 ; CHECK-LABEL: test_i64_140737488289792_mask_shl_18:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x8, x0, #0x3fffffff0000
-; CHECK-NEXT:    lsl x0, x8, #18
+; CHECK-NEXT:    lsl x8, x0, #18
+; CHECK-NEXT:    and x0, x8, #0xfffffffc00000000
 ; CHECK-NEXT:    ret
   %t0 = and i64 %a0, 140737488289792
   %t1 = shl i64 %t0, 18
@@ -1095,8 +1088,8 @@ define i64 @test_i64_140737488289792_mask_shl_18(i64 %a0) {
 define i64 @test_i64_18446744065119617024_mask_shl_1(i64 %a0) {
 ; CHECK-LABEL: test_i64_18446744065119617024_mask_shl_1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x8, x0, #0x7ffffffe00000000
-; CHECK-NEXT:    lsl x0, x8, #1
+; CHECK-NEXT:    lsl x8, x0, #1
+; CHECK-NEXT:    and x0, x8, #0xfffffffc00000000
 ; CHECK-NEXT:    ret
   %t0 = and i64 %a0, 18446744065119617024
   %t1 = shl i64 %t0, 1
diff --git a/llvm/test/CodeGen/AArch64/extract-bits.ll b/llvm/test/CodeGen/AArch64/extract-bits.ll
index b87157a183835d..aaa6c7eb4a30f4 100644
--- a/llvm/test/CodeGen/AArch64/extract-bits.ll
+++ b/llvm/test/CodeGen/AArch64/extract-bits.ll
@@ -1013,8 +1013,8 @@ define i32 @c1_i32(i32 %arg) nounwind {
 define i32 @c2_i32(i32 %arg) nounwind {
 ; CHECK-LABEL: c2_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ubfx w8, w0, #19, #10
-; CHECK-NEXT:    lsl w0, w8, #2
+; CHECK-NEXT:    lsr w8, w0, #17
+; CHECK-NEXT:    and w0, w8, #0xffc
 ; CHECK-NEXT:    ret
   %tmp0 = lshr i32 %arg, 19
   %tmp1 = and i32 %tmp0, 1023
@@ -1063,8 +1063,8 @@ define i64 @c1_i64(i64 %arg) nounwind {
 define i64 @c2_i64(i64 %arg) nounwind {
 ; CHECK-LABEL: c2_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ubfx x8, x0, #51, #10
-; CHECK-NEXT:    lsl x0, x8, #2
+; CHECK-NEXT:    lsr x8, x0, #49
+; CHECK-NEXT:    and x0, x8, #0xffc
 ; CHECK-NEXT:    ret
   %tmp0 = lshr i64 %arg, 51
   %tmp1 = and i64 %tmp0, 1023
@@ -1120,8 +1120,8 @@ define void @c6_i32(i32 %arg, ptr %ptr) nounwind {
 define void @c7_i32(i32 %arg, ptr %ptr) nounwind {
 ; CHECK-LABEL: c7_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ubfx w8, w0, #19, #10
-; CHECK-NEXT:    lsl w8, w8, #2
+; CHECK-NEXT:    lsr w8, w0, #17
+; CHECK-NEXT:    and w8, w8, #0xffc
 ; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    ret
   %tmp0 = lshr i32 %arg, 19
@@ -1163,8 +1163,8 @@ define void @c6_i64(i64 %arg, ptr %ptr) nounwind {
 define void @c7_i64(i64 %arg, ptr %ptr) nounwind {
 ; CHECK-LABEL: c7_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ubfx x8, x0, #51, #10
-; CHECK-NEXT:    lsl x8, x8, #2
+; CHECK-NEXT:    lsr x8, x0, #49
+; CHECK-NEXT:    and x8, x8, #0xffc
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    ret
   %tmp0 = lshr i64 %arg, 51
diff --git a/llvm/test/CodeGen/AArch64/fpenv.ll b/llvm/test/CodeGen/AArch64/fpenv.ll
index 3a307f7731037a..3351565d8dd89d 100644
--- a/llvm/test/CodeGen/AArch64/fpenv.ll
+++ b/llvm/test/CodeGen/AArch64/fpenv.ll
@@ -4,11 +4,11 @@
 define void @func_set_rounding_dyn(i32 %rm) {
 ; CHECK-LABEL: func_set_rounding_dyn:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub w9, w0, #1
+; CHECK-NEXT:    lsl w9, w0, #22
 ; CHECK-NEXT:    mrs x8, FPCR
-; CHECK-NEXT:    and w9, w9, #0x3
 ; CHECK-NEXT:    and x8, x8, #0xffffffffff3fffff
-; CHECK-NEXT:    lsl w9, w9, #22
+; CHECK-NEXT:    sub w9, w9, #1024, lsl #12 // =4194304
+; CHECK-NEXT:    and w9, w9, #0xc00000
 ; CHECK-NEXT:    orr x8, x8, x9
 ; CHECK-NEXT:    msr FPCR, x8
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/swap-compare-operands.ll b/llvm/test/CodeGen/AArch64/swap-compare-operands.ll
index b106e15c23e30a..a45881f2034b21 100644
--- a/llvm/test/CodeGen/AArch64/swap-compare-operands.ll
+++ b/llvm/test/CodeGen/AArch64/swap-compare-operands.ll
@@ -133,8 +133,9 @@ entry:
 
 define i1 @testSwapCmpWithShiftedZeroExtend16_64(i16 %a, i64 %b) {
 ; CHECK-LABEL: testSwapCmpWithShiftedZeroExtend16_64
-; CHECK:      cmp    x1, w0, uxth #2
-; CHECK-NEXT: cset   w0, lo
+; CHECK:      ubfiz  x8, x0, #2, #16
+; CHECK:      cmp    x8, x1
+; CHECK-NEXT: cset   w0, hi
 entry:
   %a64 = zext i16 %a to i64
   %shl.0 = shl i64 %a64, 2
@@ -144,8 +145,9 @@ entry:
 
 define i1 @testSwapCmpWithShiftedZeroExtend8_64(i8 %a, i64 %b) {
 ; CHECK-LABEL: testSwapCmpWithShiftedZeroExtend8_64
-; CHECK:      cmp    x1, w0, uxtb #4
-; CHECK-NEXT: cset    w0, lo
+; CHECK:      ubfiz  x8, x0, #4, #8
+; CHECK:      cmp    x8, x1
+; CHECK-NEXT: cset    w0, hi
 entry:
   %a64 = zext i8 %a to i64
   %shl.2 = shl i64 %a64, 4
@@ -155,8 +157,9 @@ entry:
 
 define i1 @testSwapCmpWithShiftedZeroExtend16_32(i16 %a, i32 %b) {
 ; CHECK-LABEL: testSwapCmpWithShiftedZeroExtend16_32
-; CHECK:      cmp    w1, w0, uxth #3
-; CHECK-NEXT: cset    w0, lo
+; CHECK:      ubfiz  w8, w0, #3, #16
+; CHECK:      cmp    w8, w1
+; CHECK-NEXT: cset    w0, hi
 entry:
   %a32 = zext i16 %a to i32
   %shl = shl i32 %a32, 3
@@ -166,8 +169,9 @@ entry:
 
 define i1 @testSwapCmpWithShiftedZeroExtend8_32(i8 %a, i32 %b) {
 ; CHECK-LABEL: testSwapCmpWithShiftedZeroExtend8_32
-; CHECK:      cmp    w1, w0, uxtb #4
-; CHECK-NEXT: cset    w0, lo
+; CHECK:      ubfiz  w8, w0, #4, #8
+; CHECK:      cmp    w8, w1
+; CHECK-NEXT: cset    w0, hi
 entry:
   %a32 = zext i8 %a to i32
   %shl = shl i32 %a32, 4
@@ -177,9 +181,9 @@ entry:
 
 define i1 @testSwapCmpWithTooLargeShiftedZeroExtend8_32(i8 %a, i32 %b) {
 ; CHECK-LABEL: testSwapCmpWithTooLargeShiftedZeroExtend8_32
-; CHECK:      and    [[REG:w[0-9]+]], w0, #0xff
-; CHECK:      cmp    w1, [[REG]], lsl #5
-; CHECK-NEXT: cset   w0, lo
+; CHECK:      ubfiz  w8, w0, #5, #8
+; CHECK:      cmp    w8, w1
+; CHECK-NEXT: cset   w0, hi
 entry:
   %a32 = zext i8 %a to i32
   %shl = shl i32 %a32, 5
@@ -517,7 +521,8 @@ t1:
   %shl1 = shl i64 %conv1, 4
   %na1 = sub i64 0, %shl1
   %cmp1 = icmp ne i64 %na1, %b64
-; CHECK: cmn    x3, w1, uxth #4
+; CHECK: ubfiz  x8, x1, #4, #16
+; CHECK: cmn    x3, x8
   br i1 %cmp1, label %t2, label %end
 
 t2:
@@ -525,7 +530,8 @@ t2:
   %shl2 = shl i64 %conv2, 3
   %na2 = sub i64 0, %shl2
   %cmp2 = icmp ne i64 %na2, %b64
-; CHECK: cmn    x3, w2, uxtb #3
+; CHECK: ubfiz  x8, x2, #3, #8
+; CHECK: cmn    x3, x8
   br i1 %cmp2, label %t3, label %end
 
 t3:
@@ -533,7 +539,8 @@ t3:
   %shl3 = shl i32 %conv3, 2
   %na3 = sub i32 0, %shl3
   %cmp3 = icmp ne i32 %na3, %b32
-; CHECK: cmn    w4, w1, uxth #2
+; CHECK: ubfiz  w8, w1, #2, #16
+; CHECK: cmn    w4, w8
   br i1 %cmp3, label %t4, label %end
 
 t4:
@@ -541,7 +548,8 @@ t4:
   %shl4 = shl i32 %conv4, 1
   %na4 = sub i32 0, %shl4
   %cmp4 = icmp ne i32 %na4, %b32
-; CHECK: cmn    w4, w2, uxtb #1
+; CHECK: ubfiz  w8, w2, #1, #8
+; CHECK: cmn    w4, w8
   br i1 %cmp4, label %t5, label %end
 
 t5:
@@ -549,8 +557,8 @@ t5:
   %shl5 = shl i32 %conv5, 5
   %na5 = sub i32 0, %shl5
   %cmp5 = icmp ne i32 %na5, %b32
-; CHECK: and    [[REG:w[0-9]+]], w2, #0xff
-; CHECK: cmn    w4, [[REG]], lsl #5
+; CHECK: ubfiz  w8, w2, #5, #8
+; CHECK: cmn    w4, w8
   br i1 %cmp5, label %t6, label %end
 
 t6:

>From 69be8118952f76baf2accd695aee14121ab67b59 Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes at arm.com>
Date: Fri, 13 Dec 2024 14:44:37 +0000
Subject: [PATCH 3/6] Exclude more uses of SHL that might be combined

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  8 ++--
 .../CodeGen/AArch64/swap-compare-operands.ll  | 42 ++++++++-----------
 2 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5c6b04d637b5c4..0cae5a536b6f39 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26386,9 +26386,11 @@ static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG) {
   if (!C1 || !C2)
     return SDValue();
 
-  // Might be folded into shifted add/sub, do not lower.
-  if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
-                         N->use_begin()->getOpcode() == ISD::SUB))
+  // Might be folded into shifted op, do not lower.
+  unsigned UseOpc = N->use_begin()->getOpcode();
+  if (N->hasOneUse() &&
+      (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
+       UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS))
     return SDValue();
 
   SDLoc DL(N);
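
The extra exclusions cover AArch64 compares: ADDS/SUBS and setcc users can fold a zero-extended, shifted register operand directly (for example, cmp x1, w0, uxth #2 compares x1 against (w0 & 0xffff) << 2 in a single instruction), so canonicalizing the shl first would only force a separate ubfiz, as the restored CHECK lines in the swap-compare-operands.ll diff below show. A rough model of that folded operand, using a made-up helper name (illustrative only):

    #include <cstdint>
    // Value contributed by an extended-register operand such as "w0, uxth #2".
    static uint64_t extended_operand(uint64_t reg, uint64_t ext_mask, unsigned shift) {
      return (reg & ext_mask) << shift;
    }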
diff --git a/llvm/test/CodeGen/AArch64/swap-compare-operands.ll b/llvm/test/CodeGen/AArch64/swap-compare-operands.ll
index a45881f2034b21..b106e15c23e30a 100644
--- a/llvm/test/CodeGen/AArch64/swap-compare-operands.ll
+++ b/llvm/test/CodeGen/AArch64/swap-compare-operands.ll
@@ -133,9 +133,8 @@ entry:
 
 define i1 @testSwapCmpWithShiftedZeroExtend16_64(i16 %a, i64 %b) {
 ; CHECK-LABEL: testSwapCmpWithShiftedZeroExtend16_64
-; CHECK:      ubfiz  x8, x0, #2, #16
-; CHECK:      cmp    x8, x1
-; CHECK-NEXT: cset   w0, hi
+; CHECK:      cmp    x1, w0, uxth #2
+; CHECK-NEXT: cset   w0, lo
 entry:
   %a64 = zext i16 %a to i64
   %shl.0 = shl i64 %a64, 2
@@ -145,9 +144,8 @@ entry:
 
 define i1 @testSwapCmpWithShiftedZeroExtend8_64(i8 %a, i64 %b) {
 ; CHECK-LABEL: testSwapCmpWithShiftedZeroExtend8_64
-; CHECK:      ubfiz  x8, x0, #4, #8
-; CHECK:      cmp    x8, x1
-; CHECK-NEXT: cset    w0, hi
+; CHECK:      cmp    x1, w0, uxtb #4
+; CHECK-NEXT: cset    w0, lo
 entry:
   %a64 = zext i8 %a to i64
   %shl.2 = shl i64 %a64, 4
@@ -157,9 +155,8 @@ entry:
 
 define i1 @testSwapCmpWithShiftedZeroExtend16_32(i16 %a, i32 %b) {
 ; CHECK-LABEL: testSwapCmpWithShiftedZeroExtend16_32
-; CHECK:      ubfiz  w8, w0, #3, #16
-; CHECK:      cmp    w8, w1
-; CHECK-NEXT: cset    w0, hi
+; CHECK:      cmp    w1, w0, uxth #3
+; CHECK-NEXT: cset    w0, lo
 entry:
   %a32 = zext i16 %a to i32
   %shl = shl i32 %a32, 3
@@ -169,9 +166,8 @@ entry:
 
 define i1 @testSwapCmpWithShiftedZeroExtend8_32(i8 %a, i32 %b) {
 ; CHECK-LABEL: testSwapCmpWithShiftedZeroExtend8_32
-; CHECK:      ubfiz  w8, w0, #4, #8
-; CHECK:      cmp    w8, w1
-; CHECK-NEXT: cset    w0, hi
+; CHECK:      cmp    w1, w0, uxtb #4
+; CHECK-NEXT: cset    w0, lo
 entry:
   %a32 = zext i8 %a to i32
   %shl = shl i32 %a32, 4
@@ -181,9 +177,9 @@ entry:
 
 define i1 @testSwapCmpWithTooLargeShiftedZeroExtend8_32(i8 %a, i32 %b) {
 ; CHECK-LABEL: testSwapCmpWithTooLargeShiftedZeroExtend8_32
-; CHECK:      ubfiz  w8, w0, #5, #8
-; CHECK:      cmp    w8, w1
-; CHECK-NEXT: cset   w0, hi
+; CHECK:      and    [[REG:w[0-9]+]], w0, #0xff
+; CHECK:      cmp    w1, [[REG]], lsl #5
+; CHECK-NEXT: cset   w0, lo
 entry:
   %a32 = zext i8 %a to i32
   %shl = shl i32 %a32, 5
@@ -521,8 +517,7 @@ t1:
   %shl1 = shl i64 %conv1, 4
   %na1 = sub i64 0, %shl1
   %cmp1 = icmp ne i64 %na1, %b64
-; CHECK: ubfiz  x8, x1, #4, #16
-; CHECK: cmn    x3, x8
+; CHECK: cmn    x3, w1, uxth #4
   br i1 %cmp1, label %t2, label %end
 
 t2:
@@ -530,8 +525,7 @@ t2:
   %shl2 = shl i64 %conv2, 3
   %na2 = sub i64 0, %shl2
   %cmp2 = icmp ne i64 %na2, %b64
-; CHECK: ubfiz  x8, x2, #3, #8
-; CHECK: cmn    x3, x8
+; CHECK: cmn    x3, w2, uxtb #3
   br i1 %cmp2, label %t3, label %end
 
 t3:
@@ -539,8 +533,7 @@ t3:
   %shl3 = shl i32 %conv3, 2
   %na3 = sub i32 0, %shl3
   %cmp3 = icmp ne i32 %na3, %b32
-; CHECK: ubfiz  w8, w1, #2, #16
-; CHECK: cmn    w4, w8
+; CHECK: cmn    w4, w1, uxth #2
   br i1 %cmp3, label %t4, label %end
 
 t4:
@@ -548,8 +541,7 @@ t4:
   %shl4 = shl i32 %conv4, 1
   %na4 = sub i32 0, %shl4
   %cmp4 = icmp ne i32 %na4, %b32
-; CHECK: ubfiz  w8, w2, #1, #8
-; CHECK: cmn    w4, w8
+; CHECK: cmn    w4, w2, uxtb #1
   br i1 %cmp4, label %t5, label %end
 
 t5:
@@ -557,8 +549,8 @@ t5:
   %shl5 = shl i32 %conv5, 5
   %na5 = sub i32 0, %shl5
   %cmp5 = icmp ne i32 %na5, %b32
-; CHECK: ubfiz  w8, w2, #5, #8
-; CHECK: cmn    w4, w8
+; CHECK: and    [[REG:w[0-9]+]], w2, #0xff
+; CHECK: cmn    w4, [[REG]], lsl #5
   br i1 %cmp5, label %t6, label %end
 
 t6:

>From fdbe823e14a1cf12a2358bcb141de5c469bc3b01 Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes at arm.com>
Date: Fri, 13 Dec 2024 16:50:25 +0000
Subject: [PATCH 4/6] address comments

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 35 ++++++++++---------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0cae5a536b6f39..13871b149c2b91 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26367,35 +26367,38 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   return NVCAST;
 }
 
+/// If the operand is a bitwise AND with a constant RHS, and the shift has a
+/// constant RHS and is the only use, we can pull it out of the shift, i.e.
+///
+///   (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
+///
+/// We prefer this canonical form to match existing isel patterns.
 static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG) {
-  SDValue Op0 = N->getOperand(0);
-  SDValue Op1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   if (VT != MVT::i32 && VT != MVT::i64)
     return SDValue();
 
-  // If the operand is a bitwise AND with a constant RHS, and the shift is the
-  // only use, we can pull it out of the shift.
-  //
-  // (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
+  SDValue Op0 = N->getOperand(0);
   if (!Op0.hasOneUse() || Op0.getOpcode() != ISD::AND)
     return SDValue();
 
-  ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
-  ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(Op1);
-  if (!C1 || !C2)
+  SDValue C1 = Op0->getOperand(1);
+  SDValue C2 = N->getOperand(1);
+  if (!isa<ConstantSDNode>(C1) || !isa<ConstantSDNode>(C2))
     return SDValue();
 
   // Might be folded into shifted op, do not lower.
-  unsigned UseOpc = N->use_begin()->getOpcode();
-  if (N->hasOneUse() &&
-      (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
-       UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS))
-    return SDValue();
+  if (N->hasOneUse()) {
+    unsigned UseOpc = N->use_begin()->getOpcode();
+    if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
+        UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
+      return SDValue();
+  }
 
   SDLoc DL(N);
-  SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, Op0.getOperand(1), Op1);
-  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, Op0->getOperand(0), Op1);
+  SDValue X = Op0->getOperand(0);
+  SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
+  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
   return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
 }
 

>From f0cbfdda1a8a0c09904969767078fcf7c5c9a9e5 Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes at arm.com>
Date: Wed, 8 Jan 2025 14:17:43 +0000
Subject: [PATCH 5/6] canonicalize after legalization

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 13871b149c2b91..fa13f8c6d513e6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26373,9 +26373,10 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
 ///   (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
 ///
 /// We prefer this canonical form to match existing isel patterns.
-static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
-  if (VT != MVT::i32 && VT != MVT::i64)
+static SDValue performSHLCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
   SDValue Op0 = N->getOperand(0);
@@ -26389,13 +26390,14 @@ static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG) {
 
   // Might be folded into shifted op, do not lower.
   if (N->hasOneUse()) {
-    unsigned UseOpc = N->use_begin()->getOpcode();
+    unsigned UseOpc = N->user_begin()->getOpcode();
     if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
         UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
       return SDValue();
   }
 
   SDLoc DL(N);
+  EVT VT = N->getValueType(0);
   SDValue X = Op0->getOperand(0);
   SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
   SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
@@ -26748,7 +26750,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SCALAR_TO_VECTOR:
     return performScalarToVectorCombine(N, DCI, DAG);
   case ISD::SHL:
-    return performSHLCombine(N, DAG);
+    return performSHLCombine(N, DCI, DAG);
   }
   return SDValue();
 }

>From ed9338a48b28a7e9cd8040354473be0c3ec7d675 Mon Sep 17 00:00:00 2001
From: Cullen Rhodes <cullen.rhodes at arm.com>
Date: Wed, 8 Jan 2025 14:56:37 +0000
Subject: [PATCH 6/6] address comments

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fa13f8c6d513e6..23671c9ffcf199 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26380,7 +26380,7 @@ static SDValue performSHLCombine(SDNode *N,
     return SDValue();
 
   SDValue Op0 = N->getOperand(0);
-  if (!Op0.hasOneUse() || Op0.getOpcode() != ISD::AND)
+  if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
     return SDValue();
 
   SDValue C1 = Op0->getOperand(1);


