[llvm] [AArch64] Avoid selecting XAR for reverse operations. (PR #178706)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 9 05:46:06 PST 2026
https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/178706
>From 3e8dbe8ae64cb398ccd79717c5ac7a156ab19c05 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Thu, 29 Jan 2026 05:19:46 -0800
Subject: [PATCH 1/4] Add tests.
---
llvm/test/CodeGen/AArch64/sve2-xar.ll | 46 ++++++++++
llvm/test/CodeGen/AArch64/xar.ll | 120 +++++++++++++++++++++++++-
2 files changed, 163 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
index 8f6f4510d8388..a6d6ae711db16 100644
--- a/llvm/test/CodeGen/AArch64/sve2-xar.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -296,6 +296,52 @@ define <vscale x 2 x i64> @xar_nxv2i64_shifts_neg(<vscale x 2 x i64> %x, <vscale
ret <vscale x 2 x i64> %or
}
+; Don't use XAR if REV[BHW] can be used.
+
+define <vscale x 8 x i16> @revb_nxv8i16(<vscale x 8 x i16> %r) {
+; CHECK-LABEL: revb_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: revb z0.h, p0/m, z0.h
+; CHECK-NEXT: ret
+ %or = tail call <vscale x 8 x i16> @llvm.fshl(<vscale x 8 x i16> %r, <vscale x 8 x i16> %r, <vscale x 8 x i16> splat (i16 8))
+ ret <vscale x 8 x i16> %or
+}
+
+define <vscale x 4 x i32> @revh_nxv4i32(<vscale x 4 x i32> %r) {
+; SVE-LABEL: revh_nxv4i32:
+; SVE: // %bb.0:
+; SVE-NEXT: lsr z1.s, z0.s, #16
+; SVE-NEXT: lsl z0.s, z0.s, #16
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: revh_nxv4i32:
+; SVE2: // %bb.0:
+; SVE2-NEXT: movi v1.2d, #0000000000000000
+; SVE2-NEXT: xar z0.s, z0.s, z1.s, #16
+; SVE2-NEXT: ret
+ %or = tail call <vscale x 4 x i32> @llvm.fshl(<vscale x 4 x i32> %r, <vscale x 4 x i32> %r, <vscale x 4 x i32> splat (i32 16))
+ ret <vscale x 4 x i32> %or
+}
+
+define <vscale x 2 x i64> @revw_nx2i64(<vscale x 2 x i64> %r) {
+; SVE-LABEL: revw_nx2i64:
+; SVE: // %bb.0:
+; SVE-NEXT: lsr z1.d, z0.d, #32
+; SVE-NEXT: lsl z0.d, z0.d, #32
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: revw_nx2i64:
+; SVE2: // %bb.0:
+; SVE2-NEXT: movi v1.2d, #0000000000000000
+; SVE2-NEXT: xar z0.d, z0.d, z1.d, #32
+; SVE2-NEXT: ret
+ %or = tail call <vscale x 2 x i64> @llvm.fshl(<vscale x 2 x i64> %r, <vscale x 2 x i64> %r, <vscale x 2 x i64> splat (i64 32))
+ ret <vscale x 2 x i64> %or
+}
+
declare <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll
index 652617b58eaf3..6b4d6a5e3460c 100644
--- a/llvm/test/CodeGen/AArch64/xar.ll
+++ b/llvm/test/CodeGen/AArch64/xar.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
-; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefixes=CHECK,SHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefixes=CHECK,NOSHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefixes=CHECK,SVE2 %s
/* 128-bit vectors */
@@ -359,6 +359,120 @@ entry:
ret <8 x i8> %or
}
+; Don't use XAR if REV16/REV32/REV64 can be used.
+
+define <4 x i16> @rev16_v4i16(<4 x i16> %r) {
+; CHECK-LABEL: rev16_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.8b, v0.8b
+; CHECK-NEXT: ret
+ %or = tail call <4 x i16> @llvm.fshl(<4 x i16> %r, <4 x i16> %r, <4 x i16> splat (i16 8))
+ ret <4 x i16> %or
+}
+
+define <2 x i32> @rev32_v2i32(<2 x i32> %r) {
+; SHA3-LABEL: rev32_v2i32:
+; SHA3: // %bb.0:
+; SHA3-NEXT: rev32 v0.4h, v0.4h
+; SHA3-NEXT: ret
+;
+; NOSHA3-LABEL: rev32_v2i32:
+; NOSHA3: // %bb.0:
+; NOSHA3-NEXT: rev32 v0.4h, v0.4h
+; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: rev32_v2i32:
+; SVE2: // %bb.0:
+; SVE2-NEXT: movi v1.2d, #0000000000000000
+; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT: xar z0.s, z0.s, z1.s, #16
+; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT: ret
+ %or = tail call <2 x i32> @llvm.fshl(<2 x i32> %r, <2 x i32> %r, <2 x i32> splat (i32 16))
+ ret <2 x i32> %or
+}
+
+define <1 x i64> @rev64_v1i64(<1 x i64> %r) {
+; SHA3-LABEL: rev64_v1i64:
+; SHA3: // %bb.0:
+; SHA3-NEXT: movi v1.2d, #0000000000000000
+; SHA3-NEXT: // kill: def $d0 killed $d0 def $q0
+; SHA3-NEXT: xar v0.2d, v0.2d, v1.2d, #32
+; SHA3-NEXT: // kill: def $d0 killed $d0 killed $q0
+; SHA3-NEXT: ret
+;
+; NOSHA3-LABEL: rev64_v1i64:
+; NOSHA3: // %bb.0:
+; NOSHA3-NEXT: shl d1, d0, #32
+; NOSHA3-NEXT: usra d1, d0, #32
+; NOSHA3-NEXT: fmov d0, d1
+; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: rev64_v1i64:
+; SVE2: // %bb.0:
+; SVE2-NEXT: movi v1.2d, #0000000000000000
+; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT: xar z0.d, z0.d, z1.d, #32
+; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT: ret
+ %or = tail call <1 x i64> @llvm.fshl(<1 x i64> %r, <1 x i64> %r, <1 x i64> splat (i64 32))
+ ret <1 x i64> %or
+}
+
+define <8 x i16> @rev16_v8i16(<8 x i16> %r) {
+; CHECK-LABEL: rev16_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.16b, v0.16b
+; CHECK-NEXT: ret
+ %or = tail call <8 x i16> @llvm.fshl(<8 x i16> %r, <8 x i16> %r, <8 x i16> splat (i16 8))
+ ret <8 x i16> %or
+}
+
+define <4 x i32> @rev32_v4i32(<4 x i32> %r) {
+; SHA3-LABEL: rev32_v4i32:
+; SHA3: // %bb.0:
+; SHA3-NEXT: rev32 v0.8h, v0.8h
+; SHA3-NEXT: ret
+;
+; NOSHA3-LABEL: rev32_v4i32:
+; NOSHA3: // %bb.0:
+; NOSHA3-NEXT: rev32 v0.8h, v0.8h
+; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: rev32_v4i32:
+; SVE2: // %bb.0:
+; SVE2-NEXT: movi v1.2d, #0000000000000000
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: xar z0.s, z0.s, z1.s, #16
+; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT: ret
+ %or = tail call <4 x i32> @llvm.fshl(<4 x i32> %r, <4 x i32> %r, <4 x i32> splat (i32 16))
+ ret <4 x i32> %or
+}
+
+define <2 x i64> @rev64_v2i64(<2 x i64> %r) {
+; SHA3-LABEL: rev64_v2i64:
+; SHA3: // %bb.0:
+; SHA3-NEXT: movi v1.2d, #0000000000000000
+; SHA3-NEXT: xar v0.2d, v0.2d, v1.2d, #32
+; SHA3-NEXT: ret
+;
+; NOSHA3-LABEL: rev64_v2i64:
+; NOSHA3: // %bb.0:
+; NOSHA3-NEXT: rev64 v0.4s, v0.4s
+; NOSHA3-NEXT: ret
+;
+; SVE2-LABEL: rev64_v2i64:
+; SVE2: // %bb.0:
+; SVE2-NEXT: movi v1.2d, #0000000000000000
+; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT: xar z0.d, z0.d, z1.d, #32
+; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT: ret
+ %or = tail call <2 x i64> @llvm.fshl(<2 x i64> %r, <2 x i64> %r, <2 x i64> splat (i64 32))
+ ret <2 x i64> %or
+}
+
declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
>From c5f7fb75ed33fcf9e959de6e7334bf19730f8037 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Thu, 29 Jan 2026 05:21:23 -0800
Subject: [PATCH 2/4] [AArch64] Avoid selecting XAR for reverse operations.
Rotations that implement reverse operations, for example:
```c
uint64x2_t foo(uint64x2_t r) {
return (r >> 32) | (r << 32);
}
```
are currently lowered as XAR (when available):
```gas
foo:
movi v1.2d, #0000000000000000
xar v0.2d, v0.2d, v1.2d, #32
ret
```
This is suboptimal as REV* instructions typically have higher throughput
than XAR, and do not require the zero operand.
This patch combines half-rotations to Neon or SVE REV* instructions,
such that they're no longer selected as XAR.
https://godbolt.org/z/z9Y6xbr5W
---
.../Target/AArch64/AArch64ISelLowering.cpp | 94 +++++++++++++++++++
llvm/test/CodeGen/AArch64/sve2-xar.ll | 34 ++-----
llvm/test/CodeGen/AArch64/xar.ll | 90 ++++--------------
3 files changed, 120 insertions(+), 98 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 842655d0ca0e9..30aa3a81c897f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20771,6 +20771,97 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
}
+// Attempt to use REVs for half-rotations of vectors of i32 and i64.
+// Patterns for i32:
+//
+// (OR (SHL_PRED all-true, X, (splat 16)),
+// (SRL_PRED all-true, X, (splat 16)))
+// =>
+// REVH all-true, X, poison
+//
+// (OR (VSHL X, 16), (VLSHR X, 16))
+// =>
+// NVCAST (REV32 X)
+static SDValue tryCombineToREV(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == ISD::OR && "Expected OR instruction");
+
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ unsigned EltSize = VT.getScalarSizeInBits();
+
+ // Half rotations of i16 vectors should be lowered to a bswap, so we shouldn't
+ // need custom code for them here.
+ if (EltSize != 32 && EltSize != 64)
+ return SDValue();
+
+ if (VT.isScalableVector()) {
+ if (N0.getOpcode() != AArch64ISD::SHL_PRED ||
+ N1.getOpcode() != AArch64ISD::SRL_PRED)
+ return SDValue();
+
+ // Ensure we have common inputs.
+ if (N0.getOperand(0) != N1.getOperand(0) ||
+ N0.getOperand(1) != N1.getOperand(1) ||
+ N0.getOperand(2) != N1.getOperand(2))
+ return SDValue();
+
+ // Check for all-true predicate.
+ // NOTE: Since SHL_PRED and SRL_PRED are defined ``with the result of
+ // inactive lanes being unspecified'', this shouldn't be required.
+ SDValue Pg = N0.getOperand(0);
+ if (!isAllActivePredicate(DAG, Pg))
+ return SDValue();
+
+ APInt ShAmt;
+ if (!ISD::isConstantSplatVector(N0.getOperand(2).getNode(), ShAmt) ||
+ EltSize / 2 != ShAmt)
+ return SDValue();
+
+ unsigned RevOp;
+ if (EltSize == 32)
+ RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
+ else if (EltSize == 64)
+ RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
+ else
+ llvm_unreachable("Unexpected element size");
+
+ return DAG.getNode(RevOp, SDLoc(N), VT, Pg, N0.getOperand(1),
+ DAG.getPOISON(VT));
+ }
+
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type");
+
+ if (N0.getOpcode() != AArch64ISD::VSHL || N1.getOpcode() != AArch64ISD::VLSHR)
+ return SDValue();
+
+ // Ensure common inputs.
+ if (N0.getOperand(0) != N1.getOperand(0) ||
+ N0.getOperand(1) != N1.getOperand(1))
+ return SDValue();
+
+ if (EltSize / 2 != N0.getConstantOperandVal(1))
+ return SDValue();
+
+ EVT HalfVT;
+ unsigned RevOp;
+ if (EltSize == 32) {
+ RevOp = AArch64ISD::REV32;
+ HalfVT = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+ } else if (EltSize == 64) {
+ RevOp = AArch64ISD::REV64;
+ HalfVT = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
+ } else
+ llvm_unreachable("Unexpected element size");
+
+ SDLoc DL(N);
+ return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
+ DAG.getNode(RevOp, DL, HalfVT, N0->getOperand(0)));
+}
+
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget,
const AArch64TargetLowering &TLI) {
@@ -20779,6 +20870,9 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
if (SDValue R = performANDORCSELCombine(N, DAG))
return R;
+ if (SDValue R = tryCombineToREV(N, DAG))
+ return R;
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
index a6d6ae711db16..1324528d5da24 100644
--- a/llvm/test/CodeGen/AArch64/sve2-xar.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -309,35 +309,21 @@ define <vscale x 8 x i16> @revb_nxv8i16(<vscale x 8 x i16> %r) {
}
define <vscale x 4 x i32> @revh_nxv4i32(<vscale x 4 x i32> %r) {
-; SVE-LABEL: revh_nxv4i32:
-; SVE: // %bb.0:
-; SVE-NEXT: lsr z1.s, z0.s, #16
-; SVE-NEXT: lsl z0.s, z0.s, #16
-; SVE-NEXT: orr z0.d, z0.d, z1.d
-; SVE-NEXT: ret
-;
-; SVE2-LABEL: revh_nxv4i32:
-; SVE2: // %bb.0:
-; SVE2-NEXT: movi v1.2d, #0000000000000000
-; SVE2-NEXT: xar z0.s, z0.s, z1.s, #16
-; SVE2-NEXT: ret
+; CHECK-LABEL: revh_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: revh z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
%or = tail call <vscale x 4 x i32> @llvm.fshl(<vscale x 4 x i32> %r, <vscale x 4 x i32> %r, <vscale x 4 x i32> splat (i32 16))
ret <vscale x 4 x i32> %or
}
define <vscale x 2 x i64> @revw_nx2i64(<vscale x 2 x i64> %r) {
-; SVE-LABEL: revw_nx2i64:
-; SVE: // %bb.0:
-; SVE-NEXT: lsr z1.d, z0.d, #32
-; SVE-NEXT: lsl z0.d, z0.d, #32
-; SVE-NEXT: orr z0.d, z0.d, z1.d
-; SVE-NEXT: ret
-;
-; SVE2-LABEL: revw_nx2i64:
-; SVE2: // %bb.0:
-; SVE2-NEXT: movi v1.2d, #0000000000000000
-; SVE2-NEXT: xar z0.d, z0.d, z1.d, #32
-; SVE2-NEXT: ret
+; CHECK-LABEL: revw_nx2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: revw z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
%or = tail call <vscale x 2 x i64> @llvm.fshl(<vscale x 2 x i64> %r, <vscale x 2 x i64> %r, <vscale x 2 x i64> splat (i64 32))
ret <vscale x 2 x i64> %or
}
diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll
index 6b4d6a5e3460c..c2cca5c20bd2a 100644
--- a/llvm/test/CodeGen/AArch64/xar.ll
+++ b/llvm/test/CodeGen/AArch64/xar.ll
@@ -371,50 +371,19 @@ define <4 x i16> @rev16_v4i16(<4 x i16> %r) {
}
define <2 x i32> @rev32_v2i32(<2 x i32> %r) {
-; SHA3-LABEL: rev32_v2i32:
-; SHA3: // %bb.0:
-; SHA3-NEXT: rev32 v0.4h, v0.4h
-; SHA3-NEXT: ret
-;
-; NOSHA3-LABEL: rev32_v2i32:
-; NOSHA3: // %bb.0:
-; NOSHA3-NEXT: rev32 v0.4h, v0.4h
-; NOSHA3-NEXT: ret
-;
-; SVE2-LABEL: rev32_v2i32:
-; SVE2: // %bb.0:
-; SVE2-NEXT: movi v1.2d, #0000000000000000
-; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0
-; SVE2-NEXT: xar z0.s, z0.s, z1.s, #16
-; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
-; SVE2-NEXT: ret
+; CHECK-LABEL: rev32_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v0.4h, v0.4h
+; CHECK-NEXT: ret
%or = tail call <2 x i32> @llvm.fshl(<2 x i32> %r, <2 x i32> %r, <2 x i32> splat (i32 16))
ret <2 x i32> %or
}
define <1 x i64> @rev64_v1i64(<1 x i64> %r) {
-; SHA3-LABEL: rev64_v1i64:
-; SHA3: // %bb.0:
-; SHA3-NEXT: movi v1.2d, #0000000000000000
-; SHA3-NEXT: // kill: def $d0 killed $d0 def $q0
-; SHA3-NEXT: xar v0.2d, v0.2d, v1.2d, #32
-; SHA3-NEXT: // kill: def $d0 killed $d0 killed $q0
-; SHA3-NEXT: ret
-;
-; NOSHA3-LABEL: rev64_v1i64:
-; NOSHA3: // %bb.0:
-; NOSHA3-NEXT: shl d1, d0, #32
-; NOSHA3-NEXT: usra d1, d0, #32
-; NOSHA3-NEXT: fmov d0, d1
-; NOSHA3-NEXT: ret
-;
-; SVE2-LABEL: rev64_v1i64:
-; SVE2: // %bb.0:
-; SVE2-NEXT: movi v1.2d, #0000000000000000
-; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0
-; SVE2-NEXT: xar z0.d, z0.d, z1.d, #32
-; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
-; SVE2-NEXT: ret
+; CHECK-LABEL: rev64_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev64 v0.2s, v0.2s
+; CHECK-NEXT: ret
%or = tail call <1 x i64> @llvm.fshl(<1 x i64> %r, <1 x i64> %r, <1 x i64> splat (i64 32))
ret <1 x i64> %or
}
@@ -429,46 +398,19 @@ define <8 x i16> @rev16_v8i16(<8 x i16> %r) {
}
define <4 x i32> @rev32_v4i32(<4 x i32> %r) {
-; SHA3-LABEL: rev32_v4i32:
-; SHA3: // %bb.0:
-; SHA3-NEXT: rev32 v0.8h, v0.8h
-; SHA3-NEXT: ret
-;
-; NOSHA3-LABEL: rev32_v4i32:
-; NOSHA3: // %bb.0:
-; NOSHA3-NEXT: rev32 v0.8h, v0.8h
-; NOSHA3-NEXT: ret
-;
-; SVE2-LABEL: rev32_v4i32:
-; SVE2: // %bb.0:
-; SVE2-NEXT: movi v1.2d, #0000000000000000
-; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
-; SVE2-NEXT: xar z0.s, z0.s, z1.s, #16
-; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
-; SVE2-NEXT: ret
+; CHECK-LABEL: rev32_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v0.8h, v0.8h
+; CHECK-NEXT: ret
%or = tail call <4 x i32> @llvm.fshl(<4 x i32> %r, <4 x i32> %r, <4 x i32> splat (i32 16))
ret <4 x i32> %or
}
define <2 x i64> @rev64_v2i64(<2 x i64> %r) {
-; SHA3-LABEL: rev64_v2i64:
-; SHA3: // %bb.0:
-; SHA3-NEXT: movi v1.2d, #0000000000000000
-; SHA3-NEXT: xar v0.2d, v0.2d, v1.2d, #32
-; SHA3-NEXT: ret
-;
-; NOSHA3-LABEL: rev64_v2i64:
-; NOSHA3: // %bb.0:
-; NOSHA3-NEXT: rev64 v0.4s, v0.4s
-; NOSHA3-NEXT: ret
-;
-; SVE2-LABEL: rev64_v2i64:
-; SVE2: // %bb.0:
-; SVE2-NEXT: movi v1.2d, #0000000000000000
-; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0
-; SVE2-NEXT: xar z0.d, z0.d, z1.d, #32
-; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0
-; SVE2-NEXT: ret
+; CHECK-LABEL: rev64_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev64 v0.4s, v0.4s
+; CHECK-NEXT: ret
%or = tail call <2 x i64> @llvm.fshl(<2 x i64> %r, <2 x i64> %r, <2 x i64> splat (i64 32))
ret <2 x i64> %or
}
>From e77f67fb46999f4e537a9f27e8ae6d9170d1848b Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Mon, 9 Feb 2026 03:07:55 -0800
Subject: [PATCH 3/4] Add more tests.
---
llvm/test/CodeGen/AArch64/sve2-xar.ll | 46 +++++++++++++++++++++++++++
llvm/test/CodeGen/AArch64/xar.ll | 36 +++++++++++++++++++++
2 files changed, 82 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
index 1324528d5da24..a92cfdd4002a0 100644
--- a/llvm/test/CodeGen/AArch64/sve2-xar.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -328,6 +328,52 @@ define <vscale x 2 x i64> @revw_nx2i64(<vscale x 2 x i64> %r) {
ret <vscale x 2 x i64> %or
}
+; As above, one test with rotate right.
+define <vscale x 2 x i64> @revw_nx2i64_r(<vscale x 2 x i64> %a) {
+; SVE-LABEL: revw_nx2i64_r:
+; SVE: // %bb.0:
+; SVE-NEXT: lsl z1.d, z0.d, #32
+; SVE-NEXT: lsr z0.d, z0.d, #32
+; SVE-NEXT: orr z0.d, z0.d, z1.d
+; SVE-NEXT: ret
+;
+; SVE2-LABEL: revw_nx2i64_r:
+; SVE2: // %bb.0:
+; SVE2-NEXT: movi v1.2d, #0000000000000000
+; SVE2-NEXT: xar z0.d, z0.d, z1.d, #32
+; SVE2-NEXT: ret
+ %r = tail call <vscale x 2 x i64> @llvm.fshr(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 32))
+ ret <vscale x 2 x i64> %r
+}
+
+; As above, one test with predicated shifts instead of rotate left.
+define <vscale x 4 x i32> @revh_nx4i32_shifts_l(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: revh_nx4i32_shifts_l:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl z1.s, z0.s, #16
+; CHECK-NEXT: lsr z0.s, z0.s, #16
+; CHECK-NEXT: orr z0.d, z1.d, z0.d
+; CHECK-NEXT: ret
+ %shl = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.u(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 16))
+ %shr = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.u(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 16))
+ %or = or <vscale x 4 x i32> %shl, %shr
+ ret <vscale x 4 x i32> %or
+}
+
+; As above, one test with predicated shifts instead of rotate right.
+define <vscale x 8 x i16> @revb_nx8i16_shifts_r(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: revb_nx8i16_shifts_r:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsr z1.h, z0.h, #8
+; CHECK-NEXT: lsl z0.h, z0.h, #8
+; CHECK-NEXT: orr z0.d, z1.d, z0.d
+; CHECK-NEXT: ret
+ %shr = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsr.u(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 8))
+ %shl = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.u(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 8))
+ %or = or <vscale x 8 x i16> %shr, %shl
+ ret <vscale x 8 x i16> %or
+}
+
declare <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll
index c2cca5c20bd2a..95e48a1015f33 100644
--- a/llvm/test/CodeGen/AArch64/xar.ll
+++ b/llvm/test/CodeGen/AArch64/xar.ll
@@ -415,6 +415,42 @@ define <2 x i64> @rev64_v2i64(<2 x i64> %r) {
ret <2 x i64> %or
}
+; As above, one test with rotate right.
+define <1 x i64> @rev64_v1i64_r(<1 x i64> %a) {
+; CHECK-LABEL: rev64_v1i64_r:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl d1, d0, #32
+; CHECK-NEXT: usra d1, d0, #32
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
+ %r = tail call <1 x i64> @llvm.fshr(<1 x i64> %a, <1 x i64> %a, <1 x i64> splat (i64 32))
+ ret <1 x i64> %r
+}
+
+; As above, one test with individual shifts instead of rotate left.
+define <2 x i32> @rev32_v2i32_shifts_l(<2 x i32> %a) {
+; CHECK-LABEL: rev32_v2i32_shifts_l:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v0.4h, v0.4h
+; CHECK-NEXT: ret
+ %shl = shl <2 x i32> %a, splat (i32 16)
+ %shr = lshr <2 x i32> %a, splat (i32 16)
+ %or = or <2 x i32> %shl, %shr
+ ret <2 x i32> %or
+}
+
+; As above, one test with individual shifts instead of rotate right.
+define <4 x i16> @rev16_v4i16_shifts_r(<4 x i16> %a) {
+; CHECK-LABEL: rev16_v4i16_shifts_r:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.8b, v0.8b
+; CHECK-NEXT: ret
+ %shr = lshr <4 x i16> %a, splat (i16 8)
+ %shl = shl <4 x i16> %a, splat (i16 8)
+ %or = or <4 x i16> %shr, %shl
+ ret <4 x i16> %or
+}
+
declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
>From ce19ae17abedd0a4d47ddabe70c5ac9d31f60b3c Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Fri, 6 Feb 2026 09:46:36 -0800
Subject: [PATCH 4/4] Remove all-true check and exit BeforeLegalizeOps.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 58 ++++++++++---------
llvm/test/CodeGen/AArch64/sve2-xar.ll | 25 +++-----
llvm/test/CodeGen/AArch64/xar.ll | 4 +-
3 files changed, 40 insertions(+), 47 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 30aa3a81c897f..7a225b7666039 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20771,34 +20771,39 @@ static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
}
-// Attempt to use REVs for half-rotations of vectors of i32 and i64.
+// Attempt to use REVs for half-rotations of vectors of i16, i32 and i64.
// Patterns for i32:
//
-// (OR (SHL_PRED all-true, X, (splat 16)),
-// (SRL_PRED all-true, X, (splat 16)))
+// (OR (SHL_PRED Pg, X, (splat 16)),
+// (SRL_PRED Pg, X, (splat 16)))
// =>
-// REVH all-true, X, poison
+// REVH Pg, X, poison
//
// (OR (VSHL X, 16), (VLSHR X, 16))
// =>
// NVCAST (REV32 X)
-static SDValue tryCombineToREV(SDNode *N, SelectionDAG &DAG) {
+static SDValue tryCombineToREV(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
assert(N->getOpcode() == ISD::OR && "Expected OR instruction");
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
EVT VT = N->getValueType(0);
if (!VT.isVector())
return SDValue();
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
unsigned EltSize = VT.getScalarSizeInBits();
-
- // Half rotations of i16 vectors should be lowered to a bswap, so we shouldn't
- // need custom code for them here.
- if (EltSize != 32 && EltSize != 64)
+ if (EltSize != 16 && EltSize != 32 && EltSize != 64)
return SDValue();
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
if (VT.isScalableVector()) {
+ if (N0.getOpcode() == AArch64ISD::SRL_PRED)
+ std::swap(N0, N1);
if (N0.getOpcode() != AArch64ISD::SHL_PRED ||
N1.getOpcode() != AArch64ISD::SRL_PRED)
return SDValue();
@@ -20809,32 +20814,35 @@ static SDValue tryCombineToREV(SDNode *N, SelectionDAG &DAG) {
N0.getOperand(2) != N1.getOperand(2))
return SDValue();
- // Check for all-true predicate.
- // NOTE: Since SHL_PRED and SRL_PRED are defined ``with the result of
- // inactive lanes being unspecified'', this shouldn't be required.
- SDValue Pg = N0.getOperand(0);
- if (!isAllActivePredicate(DAG, Pg))
- return SDValue();
-
APInt ShAmt;
if (!ISD::isConstantSplatVector(N0.getOperand(2).getNode(), ShAmt) ||
EltSize / 2 != ShAmt)
return SDValue();
unsigned RevOp;
- if (EltSize == 32)
+ if (EltSize == 16)
+ RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
+ else if (EltSize == 32)
RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
else if (EltSize == 64)
RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
- else
- llvm_unreachable("Unexpected element size");
- return DAG.getNode(RevOp, SDLoc(N), VT, Pg, N0.getOperand(1),
+ return DAG.getNode(RevOp, DL, VT, N0.getOperand(0), N0.getOperand(1),
DAG.getPOISON(VT));
}
assert(VT.isFixedLengthVector() && "Expected fixed length vector type");
+ // Half rotations of i16 vectors should be combined to bswap, so we shouldn't
+ // need custom code for them here.
+ // Note: This doesn't apply to scalable vectors as we allow arbitrary (but
+ // matching) predicates in the shifts. Predicated rotations aren't matched to
+ // rotl / rotr, and subsequently aren't combined to bswap.
+ if (EltSize == 16)
+ return SDValue();
+
+ if (N0.getOpcode() == AArch64ISD::VLSHR)
+ std::swap(N0, N1);
if (N0.getOpcode() != AArch64ISD::VSHL || N1.getOpcode() != AArch64ISD::VLSHR)
return SDValue();
@@ -20854,10 +20862,8 @@ static SDValue tryCombineToREV(SDNode *N, SelectionDAG &DAG) {
} else if (EltSize == 64) {
RevOp = AArch64ISD::REV64;
HalfVT = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
- } else
- llvm_unreachable("Unexpected element size");
+ }
- SDLoc DL(N);
return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
DAG.getNode(RevOp, DL, HalfVT, N0->getOperand(0)));
}
@@ -20870,7 +20876,7 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
if (SDValue R = performANDORCSELCombine(N, DAG))
return R;
- if (SDValue R = tryCombineToREV(N, DAG))
+ if (SDValue R = tryCombineToREV(N, DAG, DCI))
return R;
return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll
index a92cfdd4002a0..e72745e551dbb 100644
--- a/llvm/test/CodeGen/AArch64/sve2-xar.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll
@@ -330,18 +330,11 @@ define <vscale x 2 x i64> @revw_nx2i64(<vscale x 2 x i64> %r) {
; As above, one test with rotate right.
define <vscale x 2 x i64> @revw_nx2i64_r(<vscale x 2 x i64> %a) {
-; SVE-LABEL: revw_nx2i64_r:
-; SVE: // %bb.0:
-; SVE-NEXT: lsl z1.d, z0.d, #32
-; SVE-NEXT: lsr z0.d, z0.d, #32
-; SVE-NEXT: orr z0.d, z0.d, z1.d
-; SVE-NEXT: ret
-;
-; SVE2-LABEL: revw_nx2i64_r:
-; SVE2: // %bb.0:
-; SVE2-NEXT: movi v1.2d, #0000000000000000
-; SVE2-NEXT: xar z0.d, z0.d, z1.d, #32
-; SVE2-NEXT: ret
+; CHECK-LABEL: revw_nx2i64_r:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: revw z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
%r = tail call <vscale x 2 x i64> @llvm.fshr(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 32))
ret <vscale x 2 x i64> %r
}
@@ -350,9 +343,7 @@ define <vscale x 2 x i64> @revw_nx2i64_r(<vscale x 2 x i64> %a) {
define <vscale x 4 x i32> @revh_nx4i32_shifts_l(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a) {
; CHECK-LABEL: revh_nx4i32_shifts_l:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsl z1.s, z0.s, #16
-; CHECK-NEXT: lsr z0.s, z0.s, #16
-; CHECK-NEXT: orr z0.d, z1.d, z0.d
+; CHECK-NEXT: revh z0.s, p0/m, z0.s
; CHECK-NEXT: ret
%shl = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsl.u(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 16))
%shr = tail call <vscale x 4 x i32> @llvm.aarch64.sve.lsr.u(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 16))
@@ -364,9 +355,7 @@ define <vscale x 4 x i32> @revh_nx4i32_shifts_l(<vscale x 4 x i1> %pg, <vscale x
define <vscale x 8 x i16> @revb_nx8i16_shifts_r(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a) {
; CHECK-LABEL: revb_nx8i16_shifts_r:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsr z1.h, z0.h, #8
-; CHECK-NEXT: lsl z0.h, z0.h, #8
-; CHECK-NEXT: orr z0.d, z1.d, z0.d
+; CHECK-NEXT: revb z0.h, p0/m, z0.h
; CHECK-NEXT: ret
%shr = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsr.u(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 8))
%shl = tail call <vscale x 8 x i16> @llvm.aarch64.sve.lsl.u(<vscale x 8 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 8))
diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll
index 95e48a1015f33..4f940a39dfe39 100644
--- a/llvm/test/CodeGen/AArch64/xar.ll
+++ b/llvm/test/CodeGen/AArch64/xar.ll
@@ -419,9 +419,7 @@ define <2 x i64> @rev64_v2i64(<2 x i64> %r) {
define <1 x i64> @rev64_v1i64_r(<1 x i64> %a) {
; CHECK-LABEL: rev64_v1i64_r:
; CHECK: // %bb.0:
-; CHECK-NEXT: shl d1, d0, #32
-; CHECK-NEXT: usra d1, d0, #32
-; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: rev64 v0.2s, v0.2s
; CHECK-NEXT: ret
%r = tail call <1 x i64> @llvm.fshr(<1 x i64> %a, <1 x i64> %a, <1 x i64> splat (i64 32))
ret <1 x i64> %r
More information about the llvm-commits
mailing list