[llvm] [ARM] Lower BSWAP on Pre-V6 ARM (PR #154811)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 21 11:18:34 PDT 2025
https://github.com/AZero13 updated https://github.com/llvm/llvm-project/pull/154811
>From 62b080cbd789ed76fc5427cc1df19193e7d4bba6 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Thu, 21 Aug 2025 12:20:02 -0400
Subject: [PATCH] [ARM] Lower BSWAP on Pre-V6 ARM
---
llvm/lib/Target/ARM/ARMISelLowering.cpp | 39 ++++++++-
llvm/lib/Target/ARM/README.txt | 26 ------
.../CodeGen/ARM/load-combine-big-endian.ll | 86 +++++++------------
llvm/test/CodeGen/ARM/load-combine.ll | 65 +++++---------
4 files changed, 92 insertions(+), 124 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 830156359e9e8..35a67ae948758 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1064,7 +1064,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
// Only ARMv6 has BSWAP.
if (!Subtarget->hasV6Ops())
- setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i32, Custom);
bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
: Subtarget->hasDivideInARMMode();
@@ -9508,6 +9508,41 @@ static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
return false;
}
+static SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) {
+
+ // eor r3, r0, r0, ror #16
+ // lsr r3, r3, #8
+ // bic r3, r3, #0xFF
+ // eor r0, r3, r0, ror #8
+
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+
+ // ror rtmp, r0, #16
+ SDValue Ror16 = DAG.getNode(ISD::ROTR, DL, MVT::i32, Src,
+ DAG.getConstant(16, DL, MVT::i32));
+ // eor r1, r0, rtmp ; r1 = r0 ^ (r0 ror 16)
+ SDValue Xor1 = DAG.getNode(ISD::XOR, DL, MVT::i32, Src, Ror16);
+
+ // lsr r1, r1, #8
+ SDValue Lsr8 = DAG.getNode(ISD::SRL, DL, MVT::i32, Xor1,
+ DAG.getConstant(8, DL, MVT::i32));
+
+ // bic r1, r1, #0xFF (clear bottom byte)
+ SDValue Mask = DAG.getConstant(0xFFFFFF00u, DL, MVT::i32);
+
+ SDValue AndMasked = DAG.getNode(ISD::AND, DL, MVT::i32, Lsr8, Mask);
+
+ // ror r0, r0, #8
+ SDValue Ror8 = DAG.getNode(ISD::ROTR, DL, MVT::i32, Src,
+ DAG.getConstant(8, DL, MVT::i32));
+
+ // eor r0, AndMasked, Ror8
+ SDValue Result = DAG.getNode(ISD::XOR, DL, MVT::i32, AndMasked, Ror8);
+
+ return Result;
+}
+
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
// Multiplications are only custom-lowered for 128-bit vectors so that
// VMULL can be detected. Otherwise v2i64 multiplications are not legal.
@@ -10708,6 +10743,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UCMP:
case ISD::SCMP:
return LowerCMP(Op, DAG);
+ case ISD::BSWAP:
+ return LowerBSWAP(Op, DAG);
}
}
diff --git a/llvm/lib/Target/ARM/README.txt b/llvm/lib/Target/ARM/README.txt
index def67cfae7277..15d4447f8c649 100644
--- a/llvm/lib/Target/ARM/README.txt
+++ b/llvm/lib/Target/ARM/README.txt
@@ -606,32 +606,6 @@ constant which was already loaded). Not sure what's necessary to do that.
//===---------------------------------------------------------------------===//
-The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal:
-
-int a(int x) { return __builtin_bswap32(x); }
-
-a:
- mov r1, #255, 24
- mov r2, #255, 16
- and r1, r1, r0, lsr #8
- and r2, r2, r0, lsl #8
- orr r1, r1, r0, lsr #24
- orr r0, r2, r0, lsl #24
- orr r0, r0, r1
- bx lr
-
-Something like the following would be better (fewer instructions/registers):
- eor r1, r0, r0, ror #16
- bic r1, r1, #0xff0000
- mov r1, r1, lsr #8
- eor r0, r1, r0, ror #8
- bx lr
-
-A custom Thumb version would also be a slight improvement over the generic
-version.
-
-//===---------------------------------------------------------------------===//
-
Consider the following simple C code:
void foo(unsigned char *a, unsigned char *b, int *c) {
diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
index 4b6d14efd0ecb..b4a4d119210dd 100644
--- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
+++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll
@@ -54,13 +54,10 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) {
; CHECK-LABEL: load_i32_by_i8_bswap:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mvn r2, #255
+; CHECK-NEXT: eor r1, r0, r0, lsl #16
+; CHECK-NEXT: and r1, r2, r1, lsr #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap:
@@ -224,20 +221,14 @@ define i64 @load_i64_by_i8_bswap(ptr %arg) {
; CHECK-LABEL: load_i64_by_i8_bswap:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: mov r12, #65280
+; CHECK-NEXT: mvn r3, #255
; CHECK-NEXT: ldr r0, [r0, #4]
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r3, r12, r0, lsr #8
-; CHECK-NEXT: orr r3, r3, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: and r2, r12, r1, lsr #8
-; CHECK-NEXT: orr r0, r0, r3
-; CHECK-NEXT: and r3, r1, #65280
-; CHECK-NEXT: orr r2, r2, r1, lsr #24
-; CHECK-NEXT: lsl r1, r1, #24
-; CHECK-NEXT: orr r1, r1, r3, lsl #8
-; CHECK-NEXT: orr r1, r1, r2
+; CHECK-NEXT: eor r2, r0, r0, lsl #16
+; CHECK-NEXT: and r2, r3, r2, lsr #8
+; CHECK-NEXT: eor r0, r2, r0, ror #8
+; CHECK-NEXT: eor r2, r1, r1, lsl #16
+; CHECK-NEXT: and r2, r3, r2, lsr #8
+; CHECK-NEXT: eor r1, r2, r1, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -378,13 +369,10 @@ define i32 @load_i32_by_i8_nonzero_offset(ptr %arg) {
; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0, #1]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mvn r2, #255
+; CHECK-NEXT: eor r1, r0, r0, lsl #16
+; CHECK-NEXT: and r1, r2, r1, lsr #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset:
@@ -435,13 +423,10 @@ define i32 @load_i32_by_i8_neg_offset(ptr %arg) {
; CHECK-LABEL: load_i32_by_i8_neg_offset:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0, #-4]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mvn r2, #255
+; CHECK-NEXT: eor r1, r0, r0, lsl #16
+; CHECK-NEXT: and r1, r2, r1, lsr #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset:
@@ -588,13 +573,10 @@ define i32 @load_i32_by_bswap_i16(ptr %arg) {
; CHECK-LABEL: load_i32_by_bswap_i16:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mvn r2, #255
+; CHECK-NEXT: eor r1, r0, r0, lsl #16
+; CHECK-NEXT: and r1, r2, r1, lsr #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16:
@@ -667,14 +649,11 @@ define i32 @load_i32_by_i8_base_offset_index(ptr %arg, i32 %i) {
; CHECK-LABEL: load_i32_by_i8_base_offset_index:
; CHECK: @ %bb.0:
; CHECK-NEXT: add r0, r0, r1
-; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mvn r2, #255
; CHECK-NEXT: ldr r0, [r0, #12]
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: eor r1, r0, r0, lsl #16
+; CHECK-NEXT: and r1, r2, r1, lsr #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index:
@@ -733,14 +712,11 @@ define i32 @load_i32_by_i8_base_offset_index_2(ptr %arg, i32 %i) {
; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
; CHECK: @ %bb.0:
; CHECK-NEXT: add r0, r1, r0
-; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mvn r2, #255
; CHECK-NEXT: ldr r0, [r0, #13]
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: eor r1, r0, r0, lsl #16
+; CHECK-NEXT: and r1, r2, r1, lsr #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2:
diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll
index 0f6ec8aa47386..780b025a30d91 100644
--- a/llvm/test/CodeGen/ARM/load-combine.ll
+++ b/llvm/test/CodeGen/ARM/load-combine.ll
@@ -114,17 +114,13 @@ define i32 @load_i32_by_i8_aligned(ptr %arg) {
; ptr p; // p is 4 byte aligned
; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
define i32 @load_i32_by_i8_bswap(ptr %arg) {
-; BSWAP is not supported by 32 bit target
; CHECK-LABEL: load_i32_by_i8_bswap:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mvn r2, #255
+; CHECK-NEXT: eor r1, r0, r0, lsl #16
+; CHECK-NEXT: and r1, r2, r1, lsr #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_bswap:
@@ -238,20 +234,14 @@ define i64 @load_i64_by_i8_bswap(ptr %arg) {
; CHECK-LABEL: load_i64_by_i8_bswap:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: mov r12, #65280
+; CHECK-NEXT: mvn r3, #255
; CHECK-NEXT: ldr r0, [r0, #4]
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r3, r12, r0, lsr #8
-; CHECK-NEXT: orr r3, r3, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: and r2, r12, r1, lsr #8
-; CHECK-NEXT: orr r0, r0, r3
-; CHECK-NEXT: and r3, r1, #65280
-; CHECK-NEXT: orr r2, r2, r1, lsr #24
-; CHECK-NEXT: lsl r1, r1, #24
-; CHECK-NEXT: orr r1, r1, r3, lsl #8
-; CHECK-NEXT: orr r1, r1, r2
+; CHECK-NEXT: eor r2, r0, r0, lsl #16
+; CHECK-NEXT: and r2, r3, r2, lsr #8
+; CHECK-NEXT: eor r0, r2, r0, ror #8
+; CHECK-NEXT: eor r2, r1, r1, lsl #16
+; CHECK-NEXT: and r2, r3, r2, lsr #8
+; CHECK-NEXT: eor r1, r2, r1, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i64_by_i8_bswap:
@@ -414,13 +404,10 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(ptr %arg) {
; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0, #1]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mvn r2, #255
+; CHECK-NEXT: eor r1, r0, r0, lsl #16
+; CHECK-NEXT: and r1, r2, r1, lsr #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap:
@@ -470,13 +457,10 @@ define i32 @load_i32_by_i8_neg_offset_bswap(ptr %arg) {
; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0, #-4]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mvn r2, #255
+; CHECK-NEXT: eor r1, r0, r0, lsl #16
+; CHECK-NEXT: and r1, r2, r1, lsr #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap:
@@ -528,13 +512,10 @@ define i32 @load_i32_by_bswap_i16(ptr %arg) {
; CHECK-LABEL: load_i32_by_bswap_i16:
; CHECK: @ %bb.0:
; CHECK-NEXT: ldr r0, [r0]
-; CHECK-NEXT: mov r1, #65280
-; CHECK-NEXT: and r2, r0, #65280
-; CHECK-NEXT: and r1, r1, r0, lsr #8
-; CHECK-NEXT: orr r1, r1, r0, lsr #24
-; CHECK-NEXT: lsl r0, r0, #24
-; CHECK-NEXT: orr r0, r0, r2, lsl #8
-; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mvn r2, #255
+; CHECK-NEXT: eor r1, r0, r0, lsl #16
+; CHECK-NEXT: and r1, r2, r1, lsr #8
+; CHECK-NEXT: eor r0, r1, r0, ror #8
; CHECK-NEXT: mov pc, lr
;
; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16:
More information about the llvm-commits
mailing list