[llvm] [AArch64] Select REV16 for zext(bswap(i16)) (PR #189576)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 31 02:59:34 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Valeriy Savchenko (SavchenkoValeriy)
<details>
<summary>Changes</summary>
Extend the existing any_extend(bswap i16) -> rev16 combine to also handle zero_extend. REV16 preserves a zero upper half, so for i16 loads this saves one instruction: ldrh + rev + lsr #16 -> ldrh + rev16.
---
Full diff: https://github.com/llvm/llvm-project/pull/189576.diff
3 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+17-13)
- (modified) llvm/test/CodeGen/AArch64/bswap.ll (+11-5)
- (modified) llvm/test/CodeGen/AArch64/memcmp.ll (+15-20)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eb6e9146e3839..766cc3dcde542 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24551,23 +24551,27 @@ static SDValue performExtendCombine(SDNode *N,
N->getOperand(0)->getOpcode() == ISD::SETCC)
return performSignExtendSetCCCombine(N, DCI, DAG);
- // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
- // that the top half of the result register must be unused, due to the
- // any_extend. This means that we can replace this pattern with (rev16
- // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
- // ...)), which is what this pattern would otherwise be lowered to.
- // Only apply this optimisation if any_extend in original pattern to i32 or
- // i64, because this type will become the input type to REV16 in the new
- // pattern, so must be a legitimate REV16 input type.
+ // If we see ({any,zero}_extend (bswap ...)) with bswap returning an i16, we
+ // can replace this pattern with (rev16 ({any,zero}_extend ...)). This saves
+ // a machine instruction compared to (lsr (rev ...)) or (and (rev16 ..)),
+ // which is what this pattern would otherwise be lowered to.
+ // For any_extend: the top half of the result is unused, so rev16 is correct.
+ // For zero_extend: rev16 preserves the zero upper half when the input is
+ // zero-extended (e.g. from LDRHHui), because it swaps bytes within each
+ // 16-bit half independently.
+ // Only apply this optimisation if extending to i32 or i64, because this type
+ // will become the input type to REV16 in the new pattern, so must be a
+ // legitimate REV16 input type.
SDValue Bswap = N->getOperand(0);
- if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
- Bswap.getValueType() == MVT::i16 &&
+ if ((N->getOpcode() == ISD::ANY_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND) &&
+ Bswap.getOpcode() == ISD::BSWAP && Bswap.getValueType() == MVT::i16 &&
(N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
SDLoc DL(N);
- SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
- Bswap->getOperand(0));
+ SDValue NewExtend = DAG.getNode(N->getOpcode(), DL, N->getValueType(0),
+ Bswap->getOperand(0));
return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
- NewAnyExtend);
+ NewExtend);
}
if (SDValue R = performExtendDuplaneTruncCombine(N, DAG))
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 898958fb4993f..f5a455917a1e3 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -70,11 +70,17 @@ define i128 @bswap_i16_to_i128_anyext(i16 %a) {
}
define i32 @bswap_i16_to_i32_zext(i16 %a){
-; CHECK-LABEL: bswap_i16_to_i32_zext:
-; CHECK: // %bb.0:
-; CHECK-NEXT: rev w8, w0
-; CHECK-NEXT: lsr w0, w8, #16
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: bswap_i16_to_i32_zext:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: and w8, w0, #0xffff
+; CHECK-SD-NEXT: rev16 w0, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: bswap_i16_to_i32_zext:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: rev w8, w0
+; CHECK-GI-NEXT: lsr w0, w8, #16
+; CHECK-GI-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
%4 = zext i16 %3 to i32
ret i32 %4
diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll
index 98ea86b06d6c5..8343ae9d1efe1 100644
--- a/llvm/test/CodeGen/AArch64/memcmp.ll
+++ b/llvm/test/CodeGen/AArch64/memcmp.ll
@@ -39,10 +39,9 @@ define i32 @length2(ptr %X, ptr %Y) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: ldrh w9, [x1]
-; CHECK-NEXT: rev w8, w8
-; CHECK-NEXT: rev w9, w9
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: sub w0, w8, w9, lsr #16
+; CHECK-NEXT: rev16 w8, w8
+; CHECK-NEXT: rev16 w9, w9
+; CHECK-NEXT: sub w0, w8, w9
; CHECK-NEXT: ret
%m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
ret i32 %m
@@ -53,8 +52,8 @@ define i32 @length2_const(ptr %X, ptr %Y) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: mov w8, #-12594 // =0xffffcece
-; CHECK-NEXT: rev w9, w9
-; CHECK-NEXT: add w0, w8, w9, lsr #16
+; CHECK-NEXT: rev16 w9, w9
+; CHECK-NEXT: add w0, w9, w8
; CHECK-NEXT: ret
%m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
ret i32 %m
@@ -65,8 +64,8 @@ define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w9, [x0]
; CHECK-NEXT: mov w8, #-12594 // =0xffffcece
-; CHECK-NEXT: rev w9, w9
-; CHECK-NEXT: add w8, w8, w9, lsr #16
+; CHECK-NEXT: rev16 w9, w9
+; CHECK-NEXT: add w8, w9, w8
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: cset w0, gt
; CHECK-NEXT: ret
@@ -93,10 +92,9 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: ldrh w9, [x1]
-; CHECK-NEXT: rev w8, w8
-; CHECK-NEXT: rev w9, w9
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: sub w8, w8, w9, lsr #16
+; CHECK-NEXT: rev16 w8, w8
+; CHECK-NEXT: rev16 w9, w9
+; CHECK-NEXT: sub w8, w8, w9
; CHECK-NEXT: lsr w0, w8, #31
; CHECK-NEXT: ret
%m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -109,10 +107,9 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: ldrh w9, [x1]
-; CHECK-NEXT: rev w8, w8
-; CHECK-NEXT: rev w9, w9
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: sub w8, w8, w9, lsr #16
+; CHECK-NEXT: rev16 w8, w8
+; CHECK-NEXT: rev16 w9, w9
+; CHECK-NEXT: sub w8, w8, w9
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: cset w0, gt
; CHECK-NEXT: ret
@@ -561,10 +558,8 @@ define i32 @length10(ptr %X, ptr %Y) nounwind {
; CHECK-NEXT: // %bb.1: // %loadbb1
; CHECK-NEXT: ldrh w8, [x0, #8]
; CHECK-NEXT: ldrh w9, [x1, #8]
-; CHECK-NEXT: rev w8, w8
-; CHECK-NEXT: rev w9, w9
-; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: lsr w9, w9, #16
+; CHECK-NEXT: rev16 x8, x8
+; CHECK-NEXT: rev16 x9, x9
; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: b.ne .LBB34_3
; CHECK-NEXT: // %bb.2:
``````````
</details>
https://github.com/llvm/llvm-project/pull/189576
More information about the llvm-commits
mailing list