[llvm] [AArch64] Avoid selecting XAR for reverse operations. (PR #178706)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 29 09:32:56 PST 2026
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Ricardo Jesus (rj-jesus)
Rotations that implement reverse operations, for example:
```c
#include <arm_neon.h>

uint64x2_t foo(uint64x2_t r) {
  return (r >> 32) | (r << 32);
}
```
are currently lowered to XAR (when available):
```gas
foo:
        movi    v1.2d, #0000000000000000
        xar     v0.2d, v0.2d, v1.2d, #32
        ret
```
This is suboptimal, as REV* instructions typically have higher throughput than XAR and do not require materializing a zero operand.
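For reference, rotating a lane by half its width simply swaps the lane's two halves, which is the per-element effect of the REV* instructions. A minimal scalar sketch of this equivalence (the helper name below is hypothetical, not part of the patch):
```c
#include <assert.h>
#include <stdint.h>

/* Hypothetical helper, not part of the patch: rotate a 64-bit value
   left by half its width. */
static uint64_t rotl64_by32(uint64_t x) {
  return (x << 32) | (x >> 32);
}

int main(void) {
  /* Rotating by 32 swaps the two 32-bit halves -- the same effect
     REV64 has on each 64-bit lane of a .4s arrangement. */
  assert(rotl64_by32(0x0123456789abcdefULL) == 0x89abcdef01234567ULL);
  return 0;
}
```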
This patch avoids selecting XAR when the operation can be implemented by Neon or SVE REV* instructions. It also adds patterns for selecting the latter in cases where they were missing.
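With this patch, the example above should instead lower to a single REV64, matching the updated tests in `xar.ll` below:
```gas
foo:
        rev64   v0.4s, v0.4s
        ret
```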
https://godbolt.org/z/z9Y6xbr5W
I can break the PR into separate ones if preferred.
---
Full diff: https://github.com/llvm/llvm-project/pull/178706.diff
5 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (+8-1)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+9)
- (modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+8)
- (modified) llvm/test/CodeGen/AArch64/sve2-xar.ll (+32)
- (modified) llvm/test/CodeGen/AArch64/xar.ll (+59-3)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 7b566e432b6bc..b819b3d944a22 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -4616,7 +4616,11 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
!ISD::isConstantSplatVector(N1.getOperand(2).getNode(), ShrAmt))
return false;
- if (ShlAmt + ShrAmt != VT.getScalarSizeInBits())
+ unsigned VTSizeInBits = VT.getScalarSizeInBits();
+ if (ShlAmt + ShrAmt != VTSizeInBits)
+ return false;
+ if ((VTSizeInBits == 32 || VTSizeInBits == 64) &&
+ VTSizeInBits == ShlAmt * 2)
return false;
if (!IsXOROperand) {
@@ -4700,6 +4704,9 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
unsigned VTSizeInBits = VT.getScalarSizeInBits();
if (ShAmt + HsAmt != VTSizeInBits)
return false;
+ if ((VTSizeInBits == 16 || VTSizeInBits == 32 || VTSizeInBits == 64) &&
+ VTSizeInBits == ShAmt * 2)
+ return false;
if (!IsXOROperand) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i64);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 96d45d4d48652..2e6df16b136dd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6022,12 +6022,21 @@ def : Pat<(v2i64 (bswap (v2i64 V128:$Rn))),
def : Pat<(v2i64 (or (v2i64 (AArch64vshl (v2i64 V128:$Rn), (i32 32))),
(v2i64 (AArch64vlshr (v2i64 V128:$Rn), (i32 32))))),
(v2i64 (REV64v4i32 (v2i64 V128:$Rn)))>;
+def : Pat<(v1i64 (or (v1i64 (AArch64vshl (v1i64 V64:$Rn), (i32 32))),
+ (v1i64 (AArch64vlshr (v1i64 V64:$Rn), (i32 32))))),
+ (v1i64 (REV64v2i32 (v1i64 V64:$Rn)))>;
def : Pat<(v4i32 (or (v4i32 (AArch64vshl (v4i32 V128:$Rn), (i32 16))),
(v4i32 (AArch64vlshr (v4i32 V128:$Rn), (i32 16))))),
(v4i32 (REV32v8i16 (v4i32 V128:$Rn)))>;
def : Pat<(v2i32 (or (v2i32 (AArch64vshl (v2i32 V64:$Rn), (i32 16))),
(v2i32 (AArch64vlshr (v2i32 V64:$Rn), (i32 16))))),
(v2i32 (REV32v4i16 (v2i32 V64:$Rn)))>;
+def : Pat<(v8i16 (or (v8i16 (AArch64vshl (v8i16 V128:$Rn), (i32 8))),
+ (v8i16 (AArch64vlshr (v8i16 V128:$Rn), (i32 8))))),
+ (v8i16 (REV16v16i8 (v8i16 V128:$Rn)))>;
+def : Pat<(v4i16 (or (v4i16 (AArch64vshl (v4i16 V64:$Rn), (i32 8))),
+ (v4i16 (AArch64vlshr (v4i16 V64:$Rn), (i32 8))))),
+ (v4i16 (REV16v8i8 (v4i16 V64:$Rn)))>;
//===----------------------------------------------------------------------===//
// Advanced SIMD three vector instructions.
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index dd90bf2622ec3..f73d22ee57595 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1068,6 +1068,14 @@ let Predicates = [HasSVE_or_SME] in {
defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", AArch64revh_mt>;
defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", AArch64revw_mt>;
+ def : Pat<(nxv4i32 (or (nxv4i32 (AArch64lsl_p nxv4i1:$pg, nxv4i32:$op, (nxv4i32 (splat_vector (i32 16))))),
+ (nxv4i32 (AArch64lsr_p nxv4i1:$pg, nxv4i32:$op, (nxv4i32 (splat_vector (i32 16))))))),
+ (REVH_ZPmZ_S (IMPLICIT_DEF), $pg, $op)>;
+
+ def : Pat<(nxv2i64 (or (nxv2i64 (AArch64lsl_p nxv2i1:$pg, nxv2i64:$op, (nxv2i64 (splat_vector (i64 32))))),
+ (nxv2i64 (AArch64lsr_p nxv2i1:$pg, nxv2i64:$op, (nxv2i64 (splat_vector (i64 32))))))),
+ (REVW_ZPmZ_D (IMPLICIT_DEF), $pg, $op)>;
+
defm REV_PP : sve_int_perm_reverse_p<"rev", vector_reverse, int_aarch64_sve_rev_b16, int_aarch64_sve_rev_b32, int_aarch64_sve_rev_b64>;
defm REV_ZZ : sve_int_perm_reverse_z<"rev", vector_reverse>;
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
index 8f6f4510d8388..4989cc19600e9 100644
--- a/llvm/test/CodeGen/AArch64/sve2-xar.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -296,6 +296,38 @@ define <vscale x 2 x i64> @xar_nxv2i64_shifts_neg(<vscale x 2 x i64> %x, <vscale
ret <vscale x 2 x i64> %or
}
+; Don't use XAR if REV[BHW] can be used.
+
+define <vscale x 8 x i16> @revb_nxv8i16(<vscale x 8 x i16> %r) #1 {
+; CHECK-LABEL: revb_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: revb z0.h, p0/m, z0.h
+; CHECK-NEXT: ret
+ %or = tail call <vscale x 8 x i16> @llvm.fshl(<vscale x 8 x i16> %r, <vscale x 8 x i16> %r, <vscale x 8 x i16> splat (i16 8))
+ ret <vscale x 8 x i16> %or
+}
+
+define <vscale x 4 x i32> @revh_nxv4i32(<vscale x 4 x i32> %r) #1 {
+; CHECK-LABEL: revh_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: revh z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %or = tail call <vscale x 4 x i32> @llvm.fshl(<vscale x 4 x i32> %r, <vscale x 4 x i32> %r, <vscale x 4 x i32> splat (i32 16))
+ ret <vscale x 4 x i32> %or
+}
+
+define <vscale x 2 x i64> @revw_nxv2i64(<vscale x 2 x i64> %r) #1 {
+; CHECK-LABEL: revw_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: revw z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
+ %or = tail call <vscale x 2 x i64> @llvm.fshl(<vscale x 2 x i64> %r, <vscale x 2 x i64> %r, <vscale x 2 x i64> splat (i64 32))
+ ret <vscale x 2 x i64> %or
+}
+
declare <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll
index 652617b58eaf3..c2cca5c20bd2a 100644
--- a/llvm/test/CodeGen/AArch64/xar.ll
+++ b/llvm/test/CodeGen/AArch64/xar.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
-; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefixes=CHECK,SHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefixes=CHECK,NOSHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefixes=CHECK,SVE2 %s
/* 128-bit vectors */
@@ -359,6 +359,62 @@ entry:
ret <8 x i8> %or
}
+; Don't use XAR if REV16/REV32/REV64 can be used.
+
+define <4 x i16> @rev16_v4i16(<4 x i16> %r) {
+; CHECK-LABEL: rev16_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.8b, v0.8b
+; CHECK-NEXT: ret
+ %or = tail call <4 x i16> @llvm.fshl(<4 x i16> %r, <4 x i16> %r, <4 x i16> splat (i16 8))
+ ret <4 x i16> %or
+}
+
+define <2 x i32> @rev32_v2i32(<2 x i32> %r) {
+; CHECK-LABEL: rev32_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v0.4h, v0.4h
+; CHECK-NEXT: ret
+ %or = tail call <2 x i32> @llvm.fshl(<2 x i32> %r, <2 x i32> %r, <2 x i32> splat (i32 16))
+ ret <2 x i32> %or
+}
+
+define <1 x i64> @rev64_v1i64(<1 x i64> %r) {
+; CHECK-LABEL: rev64_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev64 v0.2s, v0.2s
+; CHECK-NEXT: ret
+ %or = tail call <1 x i64> @llvm.fshl(<1 x i64> %r, <1 x i64> %r, <1 x i64> splat (i64 32))
+ ret <1 x i64> %or
+}
+
+define <8 x i16> @rev16_v8i16(<8 x i16> %r) {
+; CHECK-LABEL: rev16_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.16b, v0.16b
+; CHECK-NEXT: ret
+ %or = tail call <8 x i16> @llvm.fshl(<8 x i16> %r, <8 x i16> %r, <8 x i16> splat (i16 8))
+ ret <8 x i16> %or
+}
+
+define <4 x i32> @rev32_v4i32(<4 x i32> %r) {
+; CHECK-LABEL: rev32_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v0.8h, v0.8h
+; CHECK-NEXT: ret
+ %or = tail call <4 x i32> @llvm.fshl(<4 x i32> %r, <4 x i32> %r, <4 x i32> splat (i32 16))
+ ret <4 x i32> %or
+}
+
+define <2 x i64> @rev64_v2i64(<2 x i64> %r) {
+; CHECK-LABEL: rev64_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev64 v0.4s, v0.4s
+; CHECK-NEXT: ret
+ %or = tail call <2 x i64> @llvm.fshl(<2 x i64> %r, <2 x i64> %r, <2 x i64> splat (i64 32))
+ ret <2 x i64> %or
+}
+
declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
``````````
https://github.com/llvm/llvm-project/pull/178706
More information about the llvm-commits mailing list