[llvm-branch-commits] [llvm] release/22.x: [AArch64] Fix `shufflevector` miscompilation on `aarch64_be` (#193076) (PR #193744)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon May 4 09:59:31 PDT 2026
https://github.com/dyung updated https://github.com/llvm/llvm-project/pull/193744
>From b7235a9f538afad5cdfa1f472bc608fb186cb574 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Thu, 23 Apr 2026 11:27:17 +0200
Subject: [PATCH] [AArch64] Fix `shufflevector` miscompilation on `aarch64_be`
(#193076)
A function like
```llvm
define <4 x i16> @xtn_shuffle_even_v8i16(<8 x i16> %a) {
entry:
%r = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
ret <4 x i16> %r
}
```
will use the `xtn` instruction, which for each 32-bit vector element
keeps only the lower 16 bits, so effectively this is a truncation.
However, if the vector actually has 16-bit elements, then the conversion
from a shuffle to a truncation is only valid on LE, not on BE. On BE,
`uzp1` should be used instead. So this PR moves some logic to right
after a check for LE, so that BE does not miscompile.
I filed this as an issue on `rust-lang-rust` as
https://github.com/rust-lang/rust/issues/155459, and we had some
discussion at [#t-libs/stdarch > `simd_shuffle!` behaves
strangely on
`aarch64_be`](https://rust-lang.zulipchat.com/#narrow/channel/208962-t-libs.2Fstdarch/topic/.60simd_shuffle.21.60.20behaves.20strangely.20on.20.60aarch64_be.60/with/587766433).
---
.../Target/AArch64/AArch64ISelLowering.cpp | 54 +++++------
.../AArch64/aarch64_be-shuffle-vector.ll | 95 +++++++++++++++++++
.../AArch64/fix-shuffle-vector-be-rev.ll | 39 --------
3 files changed, 122 insertions(+), 66 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll
delete mode 100644 llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a55b1facb103f..d8398719acebc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24419,33 +24419,6 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
if (N->getOpcode() == AArch64ISD::UZP2)
return SDValue();
- // uzp1(x, undef) -> concat(truncate(x), undef)
- if (Op1.isUndef()) {
- EVT BCVT = MVT::Other, HalfVT = MVT::Other;
- switch (ResVT.getSimpleVT().SimpleTy) {
- default:
- break;
- case MVT::v16i8:
- BCVT = MVT::v8i16;
- HalfVT = MVT::v8i8;
- break;
- case MVT::v8i16:
- BCVT = MVT::v4i32;
- HalfVT = MVT::v4i16;
- break;
- case MVT::v4i32:
- BCVT = MVT::v2i64;
- HalfVT = MVT::v2i32;
- break;
- }
- if (BCVT != MVT::Other) {
- SDValue BC = DAG.getBitcast(BCVT, Op0);
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
- DAG.getUNDEF(HalfVT));
- }
- }
-
if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
return Urshr;
@@ -24487,6 +24460,33 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
if (!DAG.getDataLayout().isLittleEndian())
return SDValue();
+ // uzp1(x, undef) -> concat(truncate(x), undef)
+ if (Op1.isUndef()) {
+ EVT BCVT = MVT::Other, HalfVT = MVT::Other;
+ switch (ResVT.getSimpleVT().SimpleTy) {
+ default:
+ break;
+ case MVT::v16i8:
+ BCVT = MVT::v8i16;
+ HalfVT = MVT::v8i8;
+ break;
+ case MVT::v8i16:
+ BCVT = MVT::v4i32;
+ HalfVT = MVT::v4i16;
+ break;
+ case MVT::v4i32:
+ BCVT = MVT::v2i64;
+ HalfVT = MVT::v2i32;
+ break;
+ }
+ if (BCVT != MVT::Other) {
+ SDValue BC = DAG.getBitcast(BCVT, Op0);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
+ DAG.getUNDEF(HalfVT));
+ }
+ }
+
// uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
// Example:
// nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
diff --git a/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll b/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll
new file mode 100644
index 0000000000000..4e60d99dbed36
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64_be-shuffle-vector.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefix=CHECKLE
+; RUN: llc < %s -mtriple=aarch64_be | FileCheck %s --check-prefix=CHECKBE
+
+define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECKLE-LABEL: test_reconstructshuffle:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: mov b2, v0.b[3]
+; CHECKLE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; CHECKLE-NEXT: mov v2.b[2], v0.b[2]
+; CHECKLE-NEXT: mov v2.b[4], v0.b[1]
+; CHECKLE-NEXT: mov v2.b[6], v0.b[0]
+; CHECKLE-NEXT: zip2 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT: add v0.4h, v2.4h, v0.4h
+; CHECKLE-NEXT: bic v0.4h, #255, lsl #8
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: test_reconstructshuffle:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: rev64 v0.16b, v0.16b
+; CHECKBE-NEXT: rev64 v1.16b, v1.16b
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; CHECKBE-NEXT: mov b2, v0.b[3]
+; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; CHECKBE-NEXT: mov v2.b[2], v0.b[2]
+; CHECKBE-NEXT: mov v2.b[4], v0.b[1]
+; CHECKBE-NEXT: mov v2.b[6], v0.b[0]
+; CHECKBE-NEXT: zip2 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT: add v0.4h, v2.4h, v0.4h
+; CHECKBE-NEXT: bic v0.4h, #255, lsl #8
+; CHECKBE-NEXT: rev64 v0.4h, v0.4h
+; CHECKBE-NEXT: ret
+ %tmp1 = shufflevector <16 x i8> %a, <16 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %tmp2 = shufflevector <16 x i8> %b, <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+ %tmp3 = add <4 x i8> %tmp1, %tmp2
+ %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+; a shufflevector of even elements can become just xtn on little-endian, but not on big-endian.
+define <8 x i8> @xtn_shuffle_even_v16i8(<16 x i8> %a) {
+; CHECKLE-LABEL: xtn_shuffle_even_v16i8:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: xtn v0.8b, v0.8h
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: xtn_shuffle_even_v16i8:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: rev64 v0.16b, v0.16b
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: uzp1 v0.16b, v0.16b, v0.16b
+; CHECKBE-NEXT: rev64 v0.8b, v0.8b
+; CHECKBE-NEXT: ret
+ %r = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i8> %r
+}
+
+; a shufflevector of even elements can become just xtn on little-endian, but not on big-endian.
+define <4 x i16> @xtn_shuffle_even_v8i16(<8 x i16> %a) {
+; CHECKLE-LABEL: xtn_shuffle_even_v8i16:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: xtn v0.4h, v0.4s
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: xtn_shuffle_even_v8i16:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: rev64 v0.8h, v0.8h
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: uzp1 v0.8h, v0.8h, v0.8h
+; CHECKBE-NEXT: rev64 v0.4h, v0.4h
+; CHECKBE-NEXT: ret
+ %r = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i16> %r
+}
+
+; a shufflevector of even elements can become just xtn on little-endian, but not on big-endian.
+define <4 x i32> @xtn_shuffle_uzp1_poison_v4i32(<4 x i32> %a) {
+; CHECKLE-LABEL: xtn_shuffle_uzp1_poison_v4i32:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: xtn v0.2s, v0.2d
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: xtn_shuffle_uzp1_poison_v4i32:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: rev64 v0.4s, v0.4s
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: uzp1 v0.4s, v0.4s, v0.4s
+; CHECKBE-NEXT: rev64 v0.4s, v0.4s
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: ret
+ %r = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+ ret <4 x i32> %r
+
+}
diff --git a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll b/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
deleted file mode 100644
index 65da95e0163f4..0000000000000
--- a/llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefix=CHECKLE
-; RUN: llc < %s -mtriple=aarch64_be | FileCheck %s --check-prefix=CHECKBE
-
-define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
-; CHECKLE-LABEL: test_reconstructshuffle:
-; CHECKLE: // %bb.0:
-; CHECKLE-NEXT: mov b2, v0.b[3]
-; CHECKLE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECKLE-NEXT: mov v2.b[2], v0.b[2]
-; CHECKLE-NEXT: mov v2.b[4], v0.b[1]
-; CHECKLE-NEXT: mov v2.b[6], v0.b[0]
-; CHECKLE-NEXT: zip2 v0.8b, v1.8b, v0.8b
-; CHECKLE-NEXT: add v0.4h, v2.4h, v0.4h
-; CHECKLE-NEXT: bic v0.4h, #255, lsl #8
-; CHECKLE-NEXT: ret
-;
-; CHECKBE-LABEL: test_reconstructshuffle:
-; CHECKBE: // %bb.0:
-; CHECKBE-NEXT: rev64 v0.16b, v0.16b
-; CHECKBE-NEXT: rev64 v1.16b, v1.16b
-; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
-; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECKBE-NEXT: mov b2, v0.b[3]
-; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECKBE-NEXT: mov v2.b[2], v0.b[2]
-; CHECKBE-NEXT: mov v2.b[4], v0.b[1]
-; CHECKBE-NEXT: mov v2.b[6], v0.b[0]
-; CHECKBE-NEXT: zip2 v0.8b, v1.8b, v0.8b
-; CHECKBE-NEXT: add v0.4h, v2.4h, v0.4h
-; CHECKBE-NEXT: bic v0.4h, #255, lsl #8
-; CHECKBE-NEXT: rev64 v0.4h, v0.4h
-; CHECKBE-NEXT: ret
- %tmp1 = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
- %tmp2 = shufflevector <16 x i8> %b, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
- %tmp3 = add <4 x i8> %tmp1, %tmp2
- %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
- ret <4 x i16> %tmp4
-}
More information about the llvm-branch-commits
mailing list