[llvm] [AArch64] Select REV16 for zext(bswap(i16)) (PR #189576)

Tue Mar 31 02:59:34 PDT 2026

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: Valeriy Savchenko (SavchenkoValeriy)

<details>
<summary>Changes</summary>

Extend the existing any_extend(bswap i16) -> rev16 combine to also handle zero_extend. REV16 preserves a zero upper half, so for i16 loads this saves one instruction: ldrh+rev+lsr#16 -> ldrh+rev16.

---
Full diff: https://github.com/llvm/llvm-project/pull/189576.diff


3 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+17-13) 
- (modified) llvm/test/CodeGen/AArch64/bswap.ll (+11-5) 
- (modified) llvm/test/CodeGen/AArch64/memcmp.ll (+15-20) 


``````````diff

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index eb6e9146e3839..766cc3dcde542 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24551,23 +24551,27 @@ static SDValue performExtendCombine(SDNode *N,
       N->getOperand(0)->getOpcode() == ISD::SETCC)
     return performSignExtendSetCCCombine(N, DCI, DAG);
 
-  // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
-  // that the top half of the result register must be unused, due to the
-  // any_extend. This means that we can replace this pattern with (rev16
-  // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
-  // ...)), which is what this pattern would otherwise be lowered to.
-  // Only apply this optimisation if any_extend in original pattern to i32 or
-  // i64, because this type will become the input type to REV16 in the new
-  // pattern, so must be a legitimate REV16 input type.
+  // If we see ({any,zero}_extend (bswap ...)) with bswap returning an i16, we
+  // can replace this pattern with (rev16 ({any,zero}_extend ...)). This saves
+  // a machine instruction compared to (lsr (rev ...)) or (and (rev16 ..)),
+  // which is what this pattern would otherwise be lowered to.
+  // For any_extend: the top half of the result is unused, so rev16 is correct.
+  // For zero_extend: rev16 preserves the zero upper half when the input is
+  // zero-extended (e.g. from LDRHHui), because it swaps bytes within each
+  // 16-bit half independently.
+  // Only apply this optimisation if extending to i32 or i64, because this type
+  // will become the input type to REV16 in the new pattern, so must be a
+  // legitimate REV16 input type.
   SDValue Bswap = N->getOperand(0);
-  if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
-      Bswap.getValueType() == MVT::i16 &&
+  if ((N->getOpcode() == ISD::ANY_EXTEND ||
+       N->getOpcode() == ISD::ZERO_EXTEND) &&
+      Bswap.getOpcode() == ISD::BSWAP && Bswap.getValueType() == MVT::i16 &&
       (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
     SDLoc DL(N);
-    SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
-                                       Bswap->getOperand(0));
+    SDValue NewExtend = DAG.getNode(N->getOpcode(), DL, N->getValueType(0),
+                                    Bswap->getOperand(0));
     return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
-                       NewAnyExtend);
+                       NewExtend);
   }
 
   if (SDValue R = performExtendDuplaneTruncCombine(N, DAG))
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 898958fb4993f..f5a455917a1e3 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -70,11 +70,17 @@ define i128 @bswap_i16_to_i128_anyext(i16 %a) {
 }
 
 define i32 @bswap_i16_to_i32_zext(i16 %a){
-; CHECK-LABEL: bswap_i16_to_i32_zext:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    rev w8, w0
-; CHECK-NEXT:    lsr w0, w8, #16
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bswap_i16_to_i32_zext:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    and w8, w0, #0xffff
+; CHECK-SD-NEXT:    rev16 w0, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bswap_i16_to_i32_zext:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    rev w8, w0
+; CHECK-GI-NEXT:    lsr w0, w8, #16
+; CHECK-GI-NEXT:    ret
   %3 = call i16 @llvm.bswap.i16(i16 %a)
   %4 = zext i16 %3 to i32
   ret i32 %4
diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll
index 98ea86b06d6c5..8343ae9d1efe1 100644
--- a/llvm/test/CodeGen/AArch64/memcmp.ll
+++ b/llvm/test/CodeGen/AArch64/memcmp.ll
@@ -39,10 +39,9 @@ define i32 @length2(ptr %X, ptr %Y) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
 ; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    sub w0, w8, w9, lsr #16
+; CHECK-NEXT:    rev16 w8, w8
+; CHECK-NEXT:    rev16 w9, w9
+; CHECK-NEXT:    sub w0, w8, w9
 ; CHECK-NEXT:    ret
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
   ret i32 %m
@@ -53,8 +52,8 @@ define i32 @length2_const(ptr %X, ptr %Y) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w9, [x0]
 ; CHECK-NEXT:    mov w8, #-12594 // =0xffffcece
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    add w0, w8, w9, lsr #16
+; CHECK-NEXT:    rev16 w9, w9
+; CHECK-NEXT:    add w0, w9, w8
 ; CHECK-NEXT:    ret
   %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
   ret i32 %m
@@ -65,8 +64,8 @@ define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w9, [x0]
 ; CHECK-NEXT:    mov w8, #-12594 // =0xffffcece
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    add w8, w8, w9, lsr #16
+; CHECK-NEXT:    rev16 w9, w9
+; CHECK-NEXT:    add w8, w9, w8
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -93,10 +92,9 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
 ; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    sub w8, w8, w9, lsr #16
+; CHECK-NEXT:    rev16 w8, w8
+; CHECK-NEXT:    rev16 w9, w9
+; CHECK-NEXT:    sub w8, w8, w9
 ; CHECK-NEXT:    lsr w0, w8, #31
 ; CHECK-NEXT:    ret
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
@@ -109,10 +107,9 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldrh w8, [x0]
 ; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    sub w8, w8, w9, lsr #16
+; CHECK-NEXT:    rev16 w8, w8
+; CHECK-NEXT:    rev16 w9, w9
+; CHECK-NEXT:    sub w8, w8, w9
 ; CHECK-NEXT:    cmp w8, #0
 ; CHECK-NEXT:    cset w0, gt
 ; CHECK-NEXT:    ret
@@ -561,10 +558,8 @@ define i32 @length10(ptr %X, ptr %Y) nounwind {
 ; CHECK-NEXT:  // %bb.1: // %loadbb1
 ; CHECK-NEXT:    ldrh w8, [x0, #8]
 ; CHECK-NEXT:    ldrh w9, [x1, #8]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsr w9, w9, #16
+; CHECK-NEXT:    rev16 x8, x8
+; CHECK-NEXT:    rev16 x9, x9
 ; CHECK-NEXT:    cmp x8, x9
 ; CHECK-NEXT:    b.ne .LBB34_3
 ; CHECK-NEXT:  // %bb.2:

``````````

</details>


https://github.com/llvm/llvm-project/pull/189576