[llvm] [SelectionDAG] Optimize BSWAP yet again once more (PR #165292)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 28 09:07:07 PDT 2025
https://github.com/AZero13 updated https://github.com/llvm/llvm-project/pull/165292
>From fc95ffbd5cda9326366da42f5ed7e21036a09272 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Sat, 25 Oct 2025 12:27:59 -0400
Subject: [PATCH 1/2] [ARM] Only change mask if demanded bits says we can
optimize
Also add a switch (arm-enable-logical-imm, on by default) to turn this optimization off.
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 99 +++++++++---
llvm/test/CodeGen/ARM/and-cmpz.ll | 2 +-
llvm/test/CodeGen/ARM/fpenv.ll | 4 +-
llvm/test/CodeGen/ARM/funnel-shift-rot.ll | 5 +-
...st-and-by-const-from-lshr-in-eqcmp-zero.ll | 56 ++++---
.../CodeGen/ARM/illegal-bitfield-loadstore.ll | 10 +-
llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll | 84 ++++++----
llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll | 153 +++++++++++++-----
.../CodeGen/ARM/simplifysetcc_narrow_load.ll | 4 +-
.../CodeGen/ARM/urem-seteq-illegal-types.ll | 6 +-
llvm/test/CodeGen/Thumb/bic_imm.ll | 2 +-
llvm/test/CodeGen/Thumb2/active_lane_mask.ll | 2 +-
12 files changed, 286 insertions(+), 141 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 313ae3d68fb83..6c994f36c9833 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -118,6 +118,7 @@ using namespace llvm;
#define DEBUG_TYPE "arm-isel"
STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
STATISTIC(NumConstpoolPromoted,
@@ -142,6 +143,12 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
cl::desc("Maximum size of ALL constants to promote into a constant pool"),
cl::init(128));
+static cl::opt<bool>
+ EnableOptimizeLogicalImm("arm-enable-logical-imm", cl::Hidden,
+ cl::desc("Enable ARM logical imm instruction "
+ "optimization"),
+ cl::init(true));
+
cl::opt<unsigned>
MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
cl::desc("Maximum interleave factor for MVE VLDn to generate."),
@@ -20138,6 +20145,16 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
}
}
+static bool isLegalLogicalImmediate(unsigned Imm,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->isThumb())
+ return ARM_AM::getSOImmVal(Imm) != -1;
+ if (Subtarget->isThumb2())
+ return ARM_AM::getT2SOImmVal(Imm) != -1;
+ // Thumb1 only has 8-bit unsigned immediate.
+ return Imm <= 255;
+}
+
bool ARMTargetLowering::targetShrinkDemandedConstant(
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
TargetLoweringOpt &TLO) const {
@@ -20146,8 +20163,7 @@ bool ARMTargetLowering::targetShrinkDemandedConstant(
if (!TLO.LegalOps)
return false;
- // Only optimize AND for now.
- if (Op.getOpcode() != ISD::AND)
+ if (!EnableOptimizeLogicalImm)
return false;
EVT VT = Op.getValueType();
@@ -20158,6 +20174,14 @@ bool ARMTargetLowering::targetShrinkDemandedConstant(
assert(VT == MVT::i32 && "Unexpected integer type");
+ // Exit early if we demand all bits.
+ if (DemandedBits.popcount() == 32)
+ return false;
+
+ // Only optimize AND for now.
+ if (Op.getOpcode() != ISD::AND)
+ return false;
+
// Make sure the RHS really is a constant.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!C)
@@ -20165,21 +20189,13 @@ bool ARMTargetLowering::targetShrinkDemandedConstant(
unsigned Mask = C->getZExtValue();
+ if (Mask == 0 || Mask == ~0U)
+ return false;
+
unsigned Demanded = DemandedBits.getZExtValue();
unsigned ShrunkMask = Mask & Demanded;
unsigned ExpandedMask = Mask | ~Demanded;
- // If the mask is all zeros, let the target-independent code replace the
- // result with zero.
- if (ShrunkMask == 0)
- return false;
-
- // If the mask is all ones, erase the AND. (Currently, the target-independent
- // code won't do this, so we have to do it explicitly to avoid an infinite
- // loop in obscure cases.)
- if (ExpandedMask == ~0U)
- return TLO.CombineTo(Op, Op.getOperand(0));
-
auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
};
@@ -20192,30 +20208,61 @@ bool ARMTargetLowering::targetShrinkDemandedConstant(
return TLO.CombineTo(Op, NewOp);
};
- // Prefer uxtb mask.
- if (IsLegalMask(0xFF))
- return UseMask(0xFF);
+ // If the mask is all zeros, let the target-independent code replace the
+ // result with zero.
+ if (ShrunkMask == 0) {
+ ++NumOptimizedImms;
+ return UseMask(ShrunkMask);
+ }
- // Prefer uxth mask.
- if (IsLegalMask(0xFFFF))
- return UseMask(0xFFFF);
+ // If the mask is all ones, erase the AND. (Currently, the target-independent
+ // code won't do this, so we have to do it explicitly to avoid an infinite
+ // loop in obscure cases.)
+ if (ExpandedMask == ~0U) {
+ ++NumOptimizedImms;
+ return UseMask(ExpandedMask);
+ }
- // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
- // FIXME: Prefer a contiguous sequence of bits for other optimizations.
- if (ShrunkMask < 256)
+ // On Thumb1 with v6 ops, try the uxtb and uxth masks first.
+ if (Subtarget->isThumb1Only() && Subtarget->hasV6Ops()) {
+ if (IsLegalMask(0xFF)) {
+ ++NumOptimizedImms;
+ return UseMask(0xFF);
+ }
+
+ if (IsLegalMask(0xFFFF)) {
+ ++NumOptimizedImms;
+ return UseMask(0xFFFF);
+ }
+ }
+
+ // Don't optimize if it is legal already.
+ if (isLegalLogicalImmediate(Mask, Subtarget))
+ return false;
+
+ if (isLegalLogicalImmediate(ShrunkMask, Subtarget)) {
+ ++NumOptimizedImms;
return UseMask(ShrunkMask);
+ }
- // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
- // FIXME: Prefer a contiguous sequence of bits for other optimizations.
- if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
+ // FIXME: The check for v6 is because this interferes with some ubfx
+ // optimizations
+ if (!Subtarget->hasV6Ops() &&
+ isLegalLogicalImmediate(~ExpandedMask, Subtarget)) {
+ ++NumOptimizedImms;
return UseMask(ExpandedMask);
+ }
+
+ if ((~ExpandedMask) < 256) {
+ ++NumOptimizedImms;
+ return UseMask(ExpandedMask);
+ }
// Potential improvements:
//
// We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
// We could try to prefer Thumb1 immediates which can be lowered to a
// two-instruction sequence.
- // We could try to recognize more legal ARM/Thumb2 immediates here.
return false;
}
diff --git a/llvm/test/CodeGen/ARM/and-cmpz.ll b/llvm/test/CodeGen/ARM/and-cmpz.ll
index 1f72307f12a68..30d8079d4e4ba 100644
--- a/llvm/test/CodeGen/ARM/and-cmpz.ll
+++ b/llvm/test/CodeGen/ARM/and-cmpz.ll
@@ -92,7 +92,7 @@ false:
; T1: uxth r0, r0
; T1-NEXT: lsrs r0, r0, #9
; T1-NEXT: bne
-; T2: uxth r0, r0
+; T2: and r0, r0, #65024
; T2-NEXT: movs r2, #0
; T2-NEXT: cmp.w r2, r0, lsr #9
define void @i16_cmpz(i16 %x, ptr %foo) {
diff --git a/llvm/test/CodeGen/ARM/fpenv.ll b/llvm/test/CodeGen/ARM/fpenv.ll
index f5d87170d9153..57e264d97bc44 100644
--- a/llvm/test/CodeGen/ARM/fpenv.ll
+++ b/llvm/test/CodeGen/ARM/fpenv.ll
@@ -41,8 +41,8 @@ define void @func_05() {
; CHECK-LABEL: func_05:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmrs r0, fpscr
-; CHECK-NEXT: bic r0, r0, #12582912
; CHECK-NEXT: orr r0, r0, #4194304
+; CHECK-NEXT: bic r0, r0, #8388608
; CHECK-NEXT: vmsr fpscr, r0
; CHECK-NEXT: mov pc, lr
call void @llvm.set.rounding(i32 2)
@@ -53,8 +53,8 @@ define void @func_06() {
; CHECK-LABEL: func_06:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmrs r0, fpscr
-; CHECK-NEXT: bic r0, r0, #12582912
; CHECK-NEXT: orr r0, r0, #8388608
+; CHECK-NEXT: bic r0, r0, #4194304
; CHECK-NEXT: vmsr fpscr, r0
; CHECK-NEXT: mov pc, lr
call void @llvm.set.rounding(i32 3)
diff --git a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll
index a1b6847d623d0..6f34a5fd00314 100644
--- a/llvm/test/CodeGen/ARM/funnel-shift-rot.ll
+++ b/llvm/test/CodeGen/ARM/funnel-shift-rot.ll
@@ -19,7 +19,7 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
define i8 @rotl_i8_const_shift(i8 %x) {
; CHECK-LABEL: rotl_i8_const_shift:
; CHECK: @ %bb.0:
-; CHECK-NEXT: uxtb r1, r0
+; CHECK-NEXT: and r1, r0, #224
; CHECK-NEXT: lsl r0, r0, #3
; CHECK-NEXT: orr r0, r0, r1, lsr #5
; CHECK-NEXT: bx lr
@@ -161,8 +161,7 @@ define <4 x i32> @rotl_v4i32_rotl_const_shift(<4 x i32> %x) {
define i8 @rotr_i8_const_shift(i8 %x) {
; CHECK-LABEL: rotr_i8_const_shift:
; CHECK: @ %bb.0:
-; CHECK-NEXT: uxtb r1, r0
-; CHECK-NEXT: lsr r1, r1, #3
+; CHECK-NEXT: ubfx r1, r0, #3, #5
; CHECK-NEXT: orr r0, r1, r0, lsl #5
; CHECK-NEXT: bx lr
%f = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 3)
diff --git a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index 7cc623fb0a616..a21ac8944d7ad 100644
--- a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -21,9 +21,9 @@ define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind {
; ARM-LABEL: scalar_i8_signbit_eq:
; ARM: @ %bb.0:
; ARM-NEXT: uxtb r1, r1
-; ARM-NEXT: lsl r0, r0, r1
+; ARM-NEXT: mov r2, #128
+; ARM-NEXT: and r0, r2, r0, lsl r1
; ARM-NEXT: mov r1, #1
-; ARM-NEXT: uxtb r0, r0
; ARM-NEXT: eor r0, r1, r0, lsr #7
; ARM-NEXT: bx lr
;
@@ -42,7 +42,7 @@ define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind {
; THUMB78-NEXT: uxtb r1, r1
; THUMB78-NEXT: lsls r0, r1
; THUMB78-NEXT: movs r1, #1
-; THUMB78-NEXT: uxtb r0, r0
+; THUMB78-NEXT: and r0, r0, #128
; THUMB78-NEXT: eor.w r0, r1, r0, lsr #7
; THUMB78-NEXT: bx lr
%t0 = lshr i8 128, %y
@@ -122,9 +122,9 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind {
; ARM-LABEL: scalar_i16_signbit_eq:
; ARM: @ %bb.0:
; ARM-NEXT: uxth r1, r1
-; ARM-NEXT: lsl r0, r0, r1
+; ARM-NEXT: mov r2, #32768
+; ARM-NEXT: and r0, r2, r0, lsl r1
; ARM-NEXT: mov r1, #1
-; ARM-NEXT: uxth r0, r0
; ARM-NEXT: eor r0, r1, r0, lsr #15
; ARM-NEXT: bx lr
;
@@ -144,7 +144,7 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind {
; THUMB78-NEXT: uxth r1, r1
; THUMB78-NEXT: lsls r0, r1
; THUMB78-NEXT: movs r1, #1
-; THUMB78-NEXT: uxth r0, r0
+; THUMB78-NEXT: and r0, r0, #32768
; THUMB78-NEXT: eor.w r0, r1, r0, lsr #15
; THUMB78-NEXT: bx lr
%t0 = lshr i16 32768, %y
@@ -862,21 +862,35 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi
;------------------------------------------------------------------------------;
define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
-; ARM-LABEL: scalar_i8_signbit_ne:
-; ARM: @ %bb.0:
-; ARM-NEXT: uxtb r1, r1
-; ARM-NEXT: lsl r0, r0, r1
-; ARM-NEXT: uxtb r0, r0
-; ARM-NEXT: lsr r0, r0, #7
-; ARM-NEXT: bx lr
+; ARM6-LABEL: scalar_i8_signbit_ne:
+; ARM6: @ %bb.0:
+; ARM6-NEXT: uxtb r1, r1
+; ARM6-NEXT: mov r2, #128
+; ARM6-NEXT: and r0, r2, r0, lsl r1
+; ARM6-NEXT: lsr r0, r0, #7
+; ARM6-NEXT: bx lr
;
-; THUMB-LABEL: scalar_i8_signbit_ne:
-; THUMB: @ %bb.0:
-; THUMB-NEXT: uxtb r1, r1
-; THUMB-NEXT: lsls r0, r1
-; THUMB-NEXT: uxtb r0, r0
-; THUMB-NEXT: lsrs r0, r0, #7
-; THUMB-NEXT: bx lr
+; ARM78-LABEL: scalar_i8_signbit_ne:
+; ARM78: @ %bb.0:
+; ARM78-NEXT: uxtb r1, r1
+; ARM78-NEXT: lsl r0, r0, r1
+; ARM78-NEXT: ubfx r0, r0, #7, #1
+; ARM78-NEXT: bx lr
+;
+; THUMB6-LABEL: scalar_i8_signbit_ne:
+; THUMB6: @ %bb.0:
+; THUMB6-NEXT: uxtb r1, r1
+; THUMB6-NEXT: lsls r0, r1
+; THUMB6-NEXT: uxtb r0, r0
+; THUMB6-NEXT: lsrs r0, r0, #7
+; THUMB6-NEXT: bx lr
+;
+; THUMB78-LABEL: scalar_i8_signbit_ne:
+; THUMB78: @ %bb.0:
+; THUMB78-NEXT: uxtb r1, r1
+; THUMB78-NEXT: lsls r0, r1
+; THUMB78-NEXT: ubfx r0, r0, #7, #1
+; THUMB78-NEXT: bx lr
%t0 = lshr i8 128, %y
%t1 = and i8 %t0, %x
%res = icmp ne i8 %t1, 0 ; we are perfectly happy with 'ne' predicate
@@ -1051,3 +1065,5 @@ define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind {
%res = icmp eq i8 %t1, 1 ; should be comparing with 0
ret i1 %res
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; THUMB: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll
index 5dbf8dd86b891..822bb89ecf22a 100644
--- a/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll
+++ b/llvm/test/CodeGen/ARM/illegal-bitfield-loadstore.ll
@@ -53,10 +53,8 @@ define void @i24_and_or(ptr %a) {
define void @i24_insert_bit(ptr %a, i1 zeroext %bit) {
; LE-LABEL: i24_insert_bit:
; LE: @ %bb.0:
-; LE-NEXT: mov r3, #255
; LE-NEXT: ldrh r2, [r0]
-; LE-NEXT: orr r3, r3, #57088
-; LE-NEXT: and r2, r2, r3
+; LE-NEXT: bic r2, r2, #8192
; LE-NEXT: orr r1, r2, r1, lsl #13
; LE-NEXT: strh r1, [r0]
; LE-NEXT: mov pc, lr
@@ -64,8 +62,7 @@ define void @i24_insert_bit(ptr %a, i1 zeroext %bit) {
; BE-LABEL: i24_insert_bit:
; BE: @ %bb.0:
; BE-NEXT: ldrh r2, [r0]
-; BE-NEXT: mov r3, #57088
-; BE-NEXT: orr r3, r3, #16711680
+; BE-NEXT: mvn r3, #8192
; BE-NEXT: and r2, r3, r2, lsl #8
; BE-NEXT: orr r1, r2, r1, lsl #13
; BE-NEXT: lsr r1, r1, #8
@@ -144,8 +141,7 @@ define void @i56_insert_bit(ptr %a, i1 zeroext %bit) {
; BE-LABEL: i56_insert_bit:
; BE: @ %bb.0:
; BE-NEXT: ldrh r2, [r0, #4]!
-; BE-NEXT: mov r3, #57088
-; BE-NEXT: orr r3, r3, #16711680
+; BE-NEXT: mvn r3, #8192
; BE-NEXT: and r2, r3, r2, lsl #8
; BE-NEXT: orr r1, r2, r1, lsl #13
; BE-NEXT: lsr r1, r1, #8
diff --git a/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll b/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
index a9eda31e729e2..4aa8f1a1ae923 100644
--- a/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
+++ b/llvm/test/CodeGen/ARM/sdiv-pow2-arm-size.ll
@@ -1,13 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=armv7a -mattr=+hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,DIV
; RUN: llc -mtriple=armv7a -mattr=-hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,NODIV
; Check SREM
define dso_local i32 @test_rem(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: test_rem
-; CHECK: asr r1, r0, #31
-; CHECK-NEXT: add r1, r0, r1, lsr #30
-; CHECK-NEXT: bic r1, r1, #3
-; CHECK-NEXT: sub r0, r0, r1
+; CHECK-LABEL: test_rem:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: asr r1, r0, #31
+; CHECK-NEXT: add r1, r0, r1, lsr #30
+; CHECK-NEXT: bic r1, r1, #3
+; CHECK-NEXT: sub r0, r0, r1
+; CHECK-NEXT: bx lr
entry:
%div = srem i32 %F, 4
@@ -16,18 +19,22 @@ entry:
; Try an i16 sdiv, with a small immediate.
define dso_local signext i16 @f0(i16 signext %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f0
+; DIV-LABEL: f0:
+; DIV: @ %bb.0: @ %entry
+; DIV-NEXT: mov r1, #2
+; DIV-NEXT: sdiv r0, r0, r1
+; DIV-NEXT: sxth r0, r0
+; DIV-NEXT: bx lr
+;
+; NODIV-LABEL: f0:
+; NODIV: @ %bb.0: @ %entry
+; NODIV-NEXT: and r1, r0, #32768
+; NODIV-NEXT: add r0, r0, r1, lsr #15
+; NODIV-NEXT: sxth r0, r0
+; NODIV-NEXT: asr r0, r0, #1
+; NODIV-NEXT: bx lr
-; DIV: mov r1, #2
-; DIV-NEXT: sdiv r0, r0, r1
-; DIV-NEXT: sxth r0, r0
-; DIV-NEXT: bx lr
-; NODIV: uxth r1, r0
-; NODIV-NEXT: add r0, r0, r1, lsr #15
-; NODIV-NEXT: sxth r0, r0
-; NODIV-NEXT: asr r0, r0, #1
-; NODIV-NEXT: bx lr
entry:
%0 = sdiv i16 %F, 2
@@ -36,16 +43,20 @@ entry:
; Try an i32 sdiv, with a small immediate.
define dso_local i32 @f1(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f1
+; DIV-LABEL: f1:
+; DIV: @ %bb.0: @ %entry
+; DIV-NEXT: mov r1, #4
+; DIV-NEXT: sdiv r0, r0, r1
+; DIV-NEXT: bx lr
+;
+; NODIV-LABEL: f1:
+; NODIV: @ %bb.0: @ %entry
+; NODIV-NEXT: asr r1, r0, #31
+; NODIV-NEXT: add r0, r0, r1, lsr #30
+; NODIV-NEXT: asr r0, r0, #2
+; NODIV-NEXT: bx lr
-; DIV: mov r1, #4
-; DIV-NEXT: sdiv r0, r0, r1
-; DIV-NEXT: bx lr
-; NODIV: asr r1, r0, #31
-; NODIV-NEXT: add r0, r0, r1, lsr #30
-; NODIV-NEXT: asr r0, r0, #2
-; NODIV-NEXT: bx lr
entry:
%div = sdiv i32 %F, 4
@@ -55,10 +66,18 @@ entry:
; Try a large power of 2 immediate, which should also be materialised with 1
; move immediate instruction.
define dso_local i32 @f2(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f2
-; DIV: mov r1, #131072
-; DIV-NEXT: sdiv r0, r0, r1
-; DIV-NEXT: bx lr
+; DIV-LABEL: f2:
+; DIV: @ %bb.0: @ %entry
+; DIV-NEXT: mov r1, #131072
+; DIV-NEXT: sdiv r0, r0, r1
+; DIV-NEXT: bx lr
+;
+; NODIV-LABEL: f2:
+; NODIV: @ %bb.0: @ %entry
+; NODIV-NEXT: asr r1, r0, #31
+; NODIV-NEXT: add r0, r0, r1, lsr #15
+; NODIV-NEXT: asr r0, r0, #17
+; NODIV-NEXT: bx lr
entry:
%div = sdiv i32 %F, 131072
ret i32 %div
@@ -66,11 +85,12 @@ entry:
; MinSize not set, so should expand to the faster but longer sequence.
define dso_local i32 @f3(i32 %F) {
-; CHECK-LABEL: f3
-; CHECK: asr r1, r0, #31
-; CHECK-NEXT: add r0, r0, r1, lsr #30
-; CHECK-NEXT: asr r0, r0, #2
-; CHECK-NEXT: bx lr
+; CHECK-LABEL: f3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: asr r1, r0, #31
+; CHECK-NEXT: add r0, r0, r1, lsr #30
+; CHECK-NEXT: asr r0, r0, #2
+; CHECK-NEXT: bx lr
entry:
%div = sdiv i32 %F, 4
ret i32 %div
diff --git a/llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll b/llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll
index 4b0419577cdf0..7a93267fcc390 100644
--- a/llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll
+++ b/llvm/test/CodeGen/ARM/sdiv-pow2-thumb-size.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefixes=CHECK,T2
; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s --check-prefixes=CHECK,T2
; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefixes=CHECK,T1
@@ -13,11 +14,21 @@
; Test sdiv i16
define dso_local signext i16 @f0(i16 signext %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f0
-; CHECK: movs r1, #2
-; CHECK-NEXT: sdiv r0, r0, r1
-; CHECK-NEXT: sxth r0, r0
-; CHECK-NEXT: bx lr
+; CHECK-LABEL: f0:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r1, #2
+; CHECK-NEXT: sdiv r0, r0, r1
+; CHECK-NEXT: sxth r0, r0
+; CHECK-NEXT: bx lr
+;
+; V6M-LABEL: f0:
+; V6M: @ %bb.0: @ %entry
+; V6M-NEXT: uxth r1, r0
+; V6M-NEXT: lsrs r1, r1, #15
+; V6M-NEXT: adds r0, r0, r1
+; V6M-NEXT: sxth r0, r0
+; V6M-NEXT: asrs r0, r0, #1
+; V6M-NEXT: bx lr
entry:
%0 = sdiv i16 %F, 2
@@ -26,10 +37,19 @@ entry:
; Same as above, but now with i32
define dso_local i32 @f1(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f1
-; CHECK: movs r1, #4
-; CHECK-NEXT: sdiv r0, r0, r1
-; CHECK-NEXT: bx lr
+; CHECK-LABEL: f1:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r1, #4
+; CHECK-NEXT: sdiv r0, r0, r1
+; CHECK-NEXT: bx lr
+;
+; V6M-LABEL: f1:
+; V6M: @ %bb.0: @ %entry
+; V6M-NEXT: asrs r1, r0, #31
+; V6M-NEXT: lsrs r1, r1, #30
+; V6M-NEXT: adds r0, r0, r1
+; V6M-NEXT: asrs r0, r0, #2
+; V6M-NEXT: bx lr
entry:
%div = sdiv i32 %F, 4
@@ -38,10 +58,18 @@ entry:
; The immediate is not a power of 2, so we expect a sdiv.
define dso_local i32 @f2(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f2
-; CHECK: movs r1, #5
-; CHECK-NEXT: sdiv r0, r0, r1
-; CHECK-NEXT: bx lr
+; CHECK-LABEL: f2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r1, #5
+; CHECK-NEXT: sdiv r0, r0, r1
+; CHECK-NEXT: bx lr
+;
+; V6M-LABEL: f2:
+; V6M: @ %bb.0: @ %entry
+; V6M-NEXT: push {r7, lr}
+; V6M-NEXT: movs r1, #5
+; V6M-NEXT: bl __divsi3
+; V6M-NEXT: pop {r7, pc}
entry:
%div = sdiv i32 %F, 5
@@ -51,8 +79,28 @@ entry:
; Try a larger power of 2 immediate: immediates larger than
; 128 don't give any code size savings.
define dso_local i32 @f3(i32 %F) local_unnamed_addr #0 {
-; CHECK-LABEL: f3
-; CHECK-NOT: sdiv
+; T2-LABEL: f3:
+; T2: @ %bb.0: @ %entry
+; T2-NEXT: asrs r1, r0, #31
+; T2-NEXT: add.w r0, r0, r1, lsr #24
+; T2-NEXT: asrs r0, r0, #8
+; T2-NEXT: bx lr
+;
+; T1-LABEL: f3:
+; T1: @ %bb.0: @ %entry
+; T1-NEXT: asrs r1, r0, #31
+; T1-NEXT: lsrs r1, r1, #24
+; T1-NEXT: adds r0, r0, r1
+; T1-NEXT: asrs r0, r0, #8
+; T1-NEXT: bx lr
+;
+; V6M-LABEL: f3:
+; V6M: @ %bb.0: @ %entry
+; V6M-NEXT: asrs r1, r0, #31
+; V6M-NEXT: lsrs r1, r1, #24
+; V6M-NEXT: adds r0, r0, r1
+; V6M-NEXT: asrs r0, r0, #8
+; V6M-NEXT: bx lr
entry:
%div = sdiv i32 %F, 256
ret i32 %div
@@ -65,20 +113,32 @@ attributes #0 = { minsize norecurse nounwind optsize readnone }
; the sdiv to sdiv, but to the faster instruction sequence.
define dso_local signext i16 @f4(i16 signext %F) {
-; T2-LABEL: f4
-; T2: uxth r1, r0
-; T2-NEXT: add.w r0, r0, r1, lsr #15
-; T2-NEXT: sxth r0, r0
-; T2-NEXT: asrs r0, r0, #1
-; T2-NEXT: bx lr
-
-; T1-LABEL: f4
-; T1: uxth r1, r0
-; T1-NEXT: lsrs r1, r1, #15
-; T1-NEXT: adds r0, r0, r1
-; T1-NEXT: sxth r0, r0
-; T1-NEXT: asrs r0, r0, #1
-; T1-NEXT: bx lr
+; T2-LABEL: f4:
+; T2: @ %bb.0: @ %entry
+; T2-NEXT: and r1, r0, #32768
+; T2-NEXT: add.w r0, r0, r1, lsr #15
+; T2-NEXT: sxth r0, r0
+; T2-NEXT: asrs r0, r0, #1
+; T2-NEXT: bx lr
+;
+; T1-LABEL: f4:
+; T1: @ %bb.0: @ %entry
+; T1-NEXT: uxth r1, r0
+; T1-NEXT: lsrs r1, r1, #15
+; T1-NEXT: adds r0, r0, r1
+; T1-NEXT: sxth r0, r0
+; T1-NEXT: asrs r0, r0, #1
+; T1-NEXT: bx lr
+;
+; V6M-LABEL: f4:
+; V6M: @ %bb.0: @ %entry
+; V6M-NEXT: uxth r1, r0
+; V6M-NEXT: lsrs r1, r1, #15
+; V6M-NEXT: adds r0, r0, r1
+; V6M-NEXT: sxth r0, r0
+; V6M-NEXT: asrs r0, r0, #1
+; V6M-NEXT: bx lr
+
entry:
%0 = sdiv i16 %F, 2
@@ -86,18 +146,29 @@ entry:
}
define dso_local i32 @f5(i32 %F) {
-; T2-LABEL: f5
-; T2: asrs r1, r0, #31
-; T2-NEXT: add.w r0, r0, r1, lsr #30
-; T2-NEXT: asrs r0, r0, #2
-; T2-NEXT: bx lr
-
-; T1-LABEL: f5
-; T1: asrs r1, r0, #31
-; T1-NEXT: lsrs r1, r1, #30
-; T1-NEXT: adds r0, r0, r1
-; T1-NEXT: asrs r0, r0, #2
-; T1-NEXT: bx lr
+; T2-LABEL: f5:
+; T2: @ %bb.0: @ %entry
+; T2-NEXT: asrs r1, r0, #31
+; T2-NEXT: add.w r0, r0, r1, lsr #30
+; T2-NEXT: asrs r0, r0, #2
+; T2-NEXT: bx lr
+;
+; T1-LABEL: f5:
+; T1: @ %bb.0: @ %entry
+; T1-NEXT: asrs r1, r0, #31
+; T1-NEXT: lsrs r1, r1, #30
+; T1-NEXT: adds r0, r0, r1
+; T1-NEXT: asrs r0, r0, #2
+; T1-NEXT: bx lr
+;
+; V6M-LABEL: f5:
+; V6M: @ %bb.0: @ %entry
+; V6M-NEXT: asrs r1, r0, #31
+; V6M-NEXT: lsrs r1, r1, #30
+; V6M-NEXT: adds r0, r0, r1
+; V6M-NEXT: asrs r0, r0, #2
+; V6M-NEXT: bx lr
+
entry:
%div = sdiv i32 %F, 4
diff --git a/llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll b/llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll
index 838da59f9e412..a5f3822bfa1ae 100644
--- a/llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll
+++ b/llvm/test/CodeGen/ARM/simplifysetcc_narrow_load.ll
@@ -24,9 +24,7 @@ define i1 @test_129_15_0(ptr %y) {
; CHECK-LE-LABEL: test_129_15_0:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: ldrh r0, [r0]
-; CHECK-LE-NEXT: mov r1, #255
-; CHECK-LE-NEXT: orr r1, r1, #32512
-; CHECK-LE-NEXT: ands r0, r0, r1
+; CHECK-LE-NEXT: bics r0, r0, #32768
; CHECK-LE-NEXT: movne r0, #1
; CHECK-LE-NEXT: mov pc, lr
;
diff --git a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
index b85cb3a4f191c..dae52a27d37f0 100644
--- a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
@@ -90,7 +90,7 @@ define i1 @test_urem_even(i27 %X) nounwind {
; ARM5: @ %bb.0:
; ARM5-NEXT: ldr r1, .LCPI1_0
; ARM5-NEXT: mul r2, r0, r1
-; ARM5-NEXT: bic r0, r2, #-134217727
+; ARM5-NEXT: bic r0, r2, #134217728
; ARM5-NEXT: lsr r0, r0, #1
; ARM5-NEXT: orr r0, r0, r2, lsl #26
; ARM5-NEXT: ldr r2, .LCPI1_1
@@ -333,11 +333,9 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; ARM5-NEXT: mov r3, #171
; ARM5-NEXT: orr r3, r3, #512
; ARM5-NEXT: mul r12, r0, r3
-; ARM5-NEXT: mov r0, #1020
-; ARM5-NEXT: orr r0, r0, #1024
; ARM5-NEXT: mov r3, #254
; ARM5-NEXT: orr r3, r3, #1792
-; ARM5-NEXT: and r0, r12, r0
+; ARM5-NEXT: bic r0, r12, #2048
; ARM5-NEXT: lsr r0, r0, #1
; ARM5-NEXT: orr r0, r0, r12, lsl #10
; ARM5-NEXT: sub r12, r1, #1
diff --git a/llvm/test/CodeGen/Thumb/bic_imm.ll b/llvm/test/CodeGen/Thumb/bic_imm.ll
index 741b2cf8db2e3..a2fc448670f0c 100644
--- a/llvm/test/CodeGen/Thumb/bic_imm.ll
+++ b/llvm/test/CodeGen/Thumb/bic_imm.ll
@@ -82,7 +82,7 @@ define void @truncated_neg256(i16 %a, ptr %p) {
;
; CHECK-T2-LABEL: truncated_neg256:
; CHECK-T2: @ %bb.0:
-; CHECK-T2-NEXT: bic r0, r0, #255
+; CHECK-T2-NEXT: and r0, r0, #65280
; CHECK-T2-NEXT: strh r0, [r1]
; CHECK-T2-NEXT: bx lr
%and = and i16 %a, -256
diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index bcd92f81911b2..b75f1ff742bee 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -283,7 +283,7 @@ define void @test_width2(ptr nocapture readnone %x, ptr nocapture %y, i8 zeroext
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: adds r0, r2, #1
; CHECK-NEXT: movs r3, #1
-; CHECK-NEXT: bic r0, r0, #1
+; CHECK-NEXT: and r0, r0, #510
; CHECK-NEXT: subs r0, #2
; CHECK-NEXT: add.w r0, r3, r0, lsr #1
; CHECK-NEXT: dls lr, r0
>From 8e01b9b0864d8931edc6eb99d31255200d5db918 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Mon, 27 Oct 2025 14:24:55 -0400
Subject: [PATCH 2/2] [SelectionDAG] Optimize BSWAP yet again once more
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 32 ++++++---
llvm/lib/Target/ARM/README.txt | 26 -------
.../CodeGen/ARM/load-combine-big-endian.ll | 71 +++++++++----------
llvm/test/CodeGen/ARM/load-combine.ll | 53 +++++++-------
4 files changed, 78 insertions(+), 104 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index da4e40953b39a..eccf9e1b45097 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9899,17 +9899,29 @@ SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const {
// Use a rotate by 8. This can be further expanded if necessary.
return DAG.getNode(ISD::ROTL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
case MVT::i32:
- // This is meant for ARM speficially, which has ROTR but no ROTL.
+ // This is meant for ARM specifically, which has ROTR but no ROTL.
if (isOperationLegalOrCustom(ISD::ROTR, VT)) {
- SDValue Mask = DAG.getConstant(0x00FF00FF, dl, VT);
- // (x & 0x00FF00FF) rotr 8 | (x rotl 8) & 0x00FF00FF
- SDValue And = DAG.getNode(ISD::AND, dl, VT, Op, Mask);
- SDValue Rotr =
- DAG.getNode(ISD::ROTR, dl, VT, And, DAG.getConstant(8, dl, SHVT));
- SDValue Rotl =
- DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
- SDValue And2 = DAG.getNode(ISD::AND, dl, VT, Rotl, Mask);
- return DAG.getNode(ISD::OR, dl, VT, Rotr, And2);
+ // ror rtmp, r0, #16
+ SDValue Ror16 =
+ DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(16, dl, SHVT));
+ // eor r1, r0, rtmp ; r1 = r0 ^ (r0 ror 16)
+ SDValue Xor1 = DAG.getNode(ISD::XOR, dl, VT, Op, Ror16);
+
+ // bic r1, r1, #0xff0000 (clear bits 16-23)
+ // As an AND, the mask is the complement: ~0x00FF0000 = 0xFF00FFFF
+ SDValue Mask = DAG.getConstant(0xFF00FFFFu, dl, VT);
+ SDValue BicResult = DAG.getNode(ISD::AND, dl, VT, Xor1, Mask);
+
+ // mov r1, r1, lsr #8
+ SDValue Lsr8 = DAG.getNode(ISD::SRL, dl, VT, BicResult,
+ DAG.getConstant(8, dl, SHVT));
+
+ // ror r0, r0, #8
+ SDValue Ror8 =
+ DAG.getNode(ISD::ROTR, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+
+ // eor r0, Lsr8, Ror8
+ return DAG.getNode(ISD::XOR, dl, VT, Lsr8, Ror8);
}
Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
Tmp3 = DAG.getNode(ISD::AND, dl, VT, Op,
diff --git a/llvm/lib/Target/ARM/README.txt b/llvm/lib/Target/ARM/README.txt
index ff84e07fa084a..0170cc9e4a17f 100644
--- a/llvm/lib/Target/ARM/README.txt
+++ b/llvm/lib/Target/ARM/README.txt
@@ -606,32 +606,6 @@ constant which was already loaded). Not sure what's necessary to do that.
//===---------------------------------------------------------------------===//
-The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal:
-
-int a(int x) { return __builtin_bswap32(x); }
-
-a:
- mov r1, #255, 24
- mov r2, #255, 16
- and r1, r1, r0, lsr #8
- and r2, r2, r0, lsl #8
- orr r1, r1, r0, lsr #24
- orr r0, r2, r0, lsl #24
- orr r0, r0, r1
- bx lr
-
-Something like the following would be better (fewer instructions/registers):
- eor r1, r0, r0, ror #16
- bic r1, r1, #0xff0000
- mov r1, r1, lsr #8
- eor r0, r1, r0, ror #8
- bx lr
-
-A custom Thumb version would also be a slight improvement over the generic
-version.
-
-//===---------------------------------------------------------------------===//
-
Consider the following simple C code:
void foo(unsigned char *a, unsigned char *b, int *c) {
diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
index 1d5c8589429a4..e12bf031b01ae 100644
--- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
+++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
@@ -50,15 +50,13 @@ define i32 @load_i32_by_i8_big_endian(ptr %arg) {
; ptr p; // p is 4 byte aligned
; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
define i32 @load_i32_by_i8_bswap(ptr %arg) {
-; BSWAP is not supported by 32 bit target
; CHECK-LABEL: load_i32_by_i8_bswap:
; CHECK: @ %bb.0:
-; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: orr r1, r1, #16711680
-; CHECK-NEXT: and r2, r0, r1
-; CHECK-NEXT: and r0, r1, r0, ror #24
-; CHECK-NEXT: orr r0, r0, r2, ror #8
+; CHECK-NEXT: eor r1, r0, r0, ror #16
+; CHECK-NEXT: bic r1, r1, #16711680
+; CHECK-NEXT: lsr r1, r1, #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap:
@@ -221,16 +219,16 @@ define i32 @load_i32_by_i16_i8(ptr %arg) {
define i64 @load_i64_by_i8_bswap(ptr %arg) {
; CHECK-LABEL: load_i64_by_i8_bswap:
; CHECK: @ %bb.0:
-; CHECK-NEXT: mov r2, #255
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: ldr r0, [r0, #4]
-; CHECK-NEXT: orr r2, r2, #16711680
-; CHECK-NEXT: and r3, r0, r2
-; CHECK-NEXT: and r0, r2, r0, ror #24
-; CHECK-NEXT: orr r0, r0, r3, ror #8
-; CHECK-NEXT: and r3, r1, r2
-; CHECK-NEXT: and r1, r2, r1, ror #24
-; CHECK-NEXT: orr r1, r1, r3, ror #8
+; CHECK-NEXT: eor r2, r0, r0, ror #16
+; CHECK-NEXT: bic r2, r2, #16711680
+; CHECK-NEXT: lsr r2, r2, #8
+; CHECK-NEXT: eor r0, r2, r0, ror #8
+; CHECK-NEXT: eor r2, r1, r1, ror #16
+; CHECK-NEXT: bic r2, r2, #16711680
+; CHECK-NEXT: lsr r2, r2, #8
+; CHECK-NEXT: eor r1, r2, r1, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -370,12 +368,11 @@ define i64 @load_i64_by_i8(ptr %arg) {
define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) {
; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
; CHECK: @ %bb.0:
-; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0, #1]
-; CHECK-NEXT: orr r1, r1, #16711680
-; CHECK-NEXT: and r2, r0, r1
-; CHECK-NEXT: and r0, r1, r0, ror #24
-; CHECK-NEXT: orr r0, r0, r2, ror #8
+; CHECK-NEXT: eor r1, r0, r0, ror #16
+; CHECK-NEXT: bic r1, r1, #16711680
+; CHECK-NEXT: lsr r1, r1, #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset:
@@ -425,12 +422,11 @@ define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) {
define i32 @load_i32_by_i8_neg_offset(ptr %arg) {
; CHECK-LABEL: load_i32_by_i8_neg_offset:
; CHECK: @ %bb.0:
-; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0, #-4]
-; CHECK-NEXT: orr r1, r1, #16711680
-; CHECK-NEXT: and r2, r0, r1
-; CHECK-NEXT: and r0, r1, r0, ror #24
-; CHECK-NEXT: orr r0, r0, r2, ror #8
+; CHECK-NEXT: eor r1, r0, r0, ror #16
+; CHECK-NEXT: bic r1, r1, #16711680
+; CHECK-NEXT: lsr r1, r1, #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset:
@@ -576,12 +572,11 @@ declare i16 @llvm.bswap.i16(i16)
define i32 @load_i32_by_bswap_i16(ptr %arg) {
; CHECK-LABEL: load_i32_by_bswap_i16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: orr r1, r1, #16711680
-; CHECK-NEXT: and r2, r0, r1
-; CHECK-NEXT: and r0, r1, r0, ror #24
-; CHECK-NEXT: orr r0, r0, r2, ror #8
+; CHECK-NEXT: eor r1, r0, r0, ror #16
+; CHECK-NEXT: bic r1, r1, #16711680
+; CHECK-NEXT: lsr r1, r1, #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16:
@@ -654,12 +649,11 @@ define i32 @load_i32_by_i8_base_offset_index(ptr %arg, i32 %i) {
; CHECK-LABEL: load_i32_by_i8_base_offset_index:
; CHECK: @ %bb.0:
; CHECK-NEXT: add r0, r0, r1
-; CHECK-NEXT: mov r1, #255
-; CHECK-NEXT: orr r1, r1, #16711680
; CHECK-NEXT: ldr r0, [r0, #12]
-; CHECK-NEXT: and r2, r0, r1
-; CHECK-NEXT: and r0, r1, r0, ror #24
-; CHECK-NEXT: orr r0, r0, r2, ror #8
+; CHECK-NEXT: eor r1, r0, r0, ror #16
+; CHECK-NEXT: bic r1, r1, #16711680
+; CHECK-NEXT: lsr r1, r1, #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index:
@@ -718,12 +712,11 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) {
; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
; CHECK: @ %bb.0:
; CHECK-NEXT: add r0, r1, r0
-; CHECK-NEXT: mov r1, #255
-; CHECK-NEXT: orr r1, r1, #16711680
; CHECK-NEXT: ldr r0, [r0, #13]
-; CHECK-NEXT: and r2, r0, r1
-; CHECK-NEXT: and r0, r1, r0, ror #24
-; CHECK-NEXT: orr r0, r0, r2, ror #8
+; CHECK-NEXT: eor r1, r0, r0, ror #16
+; CHECK-NEXT: bic r1, r1, #16711680
+; CHECK-NEXT: lsr r1, r1, #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2:
diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll
index 70873672d6523..59197cd63b474 100644
--- a/llvm/test/CodeGen/ARM/load-combine.ll
+++ b/llvm/test/CodeGen/ARM/load-combine.ll
@@ -114,15 +114,13 @@ define i32 @load_i32_by_i8_aligned(ptr %arg) {
; ptr p; // p is 4 byte aligned
; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
define i32 @load_i32_by_i8_bswap(ptr %arg) {
-; BSWAP is not supported by 32 bit target
; CHECK-LABEL: load_i32_by_i8_bswap:
; CHECK: @ %bb.0:
-; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: orr r1, r1, #16711680
-; CHECK-NEXT: and r2, r0, r1
-; CHECK-NEXT: and r0, r1, r0, ror #24
-; CHECK-NEXT: orr r0, r0, r2, ror #8
+; CHECK-NEXT: eor r1, r0, r0, ror #16
+; CHECK-NEXT: bic r1, r1, #16711680
+; CHECK-NEXT: lsr r1, r1, #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap:
@@ -235,16 +233,16 @@ define i64 @load_i64_by_i8(ptr %arg) {
define i64 @load_i64_by_i8_bswap(ptr %arg) {
; CHECK-LABEL: load_i64_by_i8_bswap:
; CHECK: @ %bb.0:
-; CHECK-NEXT: mov r2, #255
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: ldr r0, [r0, #4]
-; CHECK-NEXT: orr r2, r2, #16711680
-; CHECK-NEXT: and r3, r0, r2
-; CHECK-NEXT: and r0, r2, r0, ror #24
-; CHECK-NEXT: orr r0, r0, r3, ror #8
-; CHECK-NEXT: and r3, r1, r2
-; CHECK-NEXT: and r1, r2, r1, ror #24
-; CHECK-NEXT: orr r1, r1, r3, ror #8
+; CHECK-NEXT: eor r2, r0, r0, ror #16
+; CHECK-NEXT: bic r2, r2, #16711680
+; CHECK-NEXT: lsr r2, r2, #8
+; CHECK-NEXT: eor r0, r2, r0, ror #8
+; CHECK-NEXT: eor r2, r1, r1, ror #16
+; CHECK-NEXT: bic r2, r2, #16711680
+; CHECK-NEXT: lsr r2, r2, #8
+; CHECK-NEXT: eor r1, r2, r1, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -406,12 +404,11 @@ define i32 @load_i32_by_i8_neg_offset(ptr %arg) {
define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) {
; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
; CHECK: @ %bb.0:
-; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0, #1]
-; CHECK-NEXT: orr r1, r1, #16711680
-; CHECK-NEXT: and r2, r0, r1
-; CHECK-NEXT: and r0, r1, r0, ror #24
-; CHECK-NEXT: orr r0, r0, r2, ror #8
+; CHECK-NEXT: eor r1, r0, r0, ror #16
+; CHECK-NEXT: bic r1, r1, #16711680
+; CHECK-NEXT: lsr r1, r1, #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap:
@@ -460,12 +457,11 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) {
define i32 @load_i32_by_i8_neg_offset_bswap(ptr %arg) {
; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
; CHECK: @ %bb.0:
-; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0, #-4]
-; CHECK-NEXT: orr r1, r1, #16711680
-; CHECK-NEXT: and r2, r0, r1
-; CHECK-NEXT: and r0, r1, r0, ror #24
-; CHECK-NEXT: orr r0, r0, r2, ror #8
+; CHECK-NEXT: eor r1, r0, r0, ror #16
+; CHECK-NEXT: bic r1, r1, #16711680
+; CHECK-NEXT: lsr r1, r1, #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap:
@@ -516,12 +512,11 @@ declare i16 @llvm.bswap.i16(i16)
define i32 @load_i32_by_bswap_i16(ptr %arg) {
; CHECK-LABEL: load_i32_by_bswap_i16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: mov r1, #255
; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: orr r1, r1, #16711680
-; CHECK-NEXT: and r2, r0, r1
-; CHECK-NEXT: and r0, r1, r0, ror #24
-; CHECK-NEXT: orr r0, r0, r2, ror #8
+; CHECK-NEXT: eor r1, r0, r0, ror #16
+; CHECK-NEXT: bic r1, r1, #16711680
+; CHECK-NEXT: lsr r1, r1, #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16:
More information about the llvm-commits
mailing list