[clang] a619a2e - [ARM] Fix lane ordering for AdvSIMD intrinsics on big-endian targets (#127068)
Author: Oliver Stannard
Date: 2025-03-04T08:10:22Z
New Revision: a619a2e53a9ba09ba18a047b8389bf4dd1912b72
URL: https://github.com/llvm/llvm-project/commit/a619a2e53a9ba09ba18a047b8389bf4dd1912b72
DIFF: https://github.com/llvm/llvm-project/commit/a619a2e53a9ba09ba18a047b8389bf4dd1912b72.diff
LOG: [ARM] Fix lane ordering for AdvSIMD intrinsics on big-endian targets (#127068)
In arm_neon.h, we insert shufflevectors around each intrinsic when the
target is big-endian, to compensate for the difference between the
ABI-defined memory format of vectors (with the whole vector stored as
one big-endian access) and LLVM's target-independent expectations (with
the lowest-numbered lane at the lowest address). However, this code was
written for the AArch64 ABI, and the AArch32 ABI differs slightly: it
requires vectors to be stored in memory as if stored by VSTM, which
performs a series of 64-bit accesses, rather than by the AArch64 STR,
which performs a single 128-bit access. This means that for AArch32 we
need to reverse the lanes in each 64-bit chunk of the vector, rather
than in the whole vector.
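As a minimal illustration (the function names here are hypothetical;
only the index lists are taken from this patch), these are the two
orderings for a 128-bit vector of 32-bit lanes:

    #include <arm_neon.h>

    // Big-endian AArch64: reverse all four lanes of the 128-bit vector.
    int32x4_t reverse_whole(int32x4_t v) {
      return __builtin_shufflevector(v, v, 3, 2, 1, 0);
    }

    // Big-endian AArch32: reverse the lanes within each 64-bit chunk,
    // matching the VSTM/VLDM memory format.
    int32x4_t reverse_chunks(int32x4_t v) {
      return __builtin_shufflevector(v, v, 1, 0, 3, 2);
    }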
Since only a small number of different shufflevector orderings are
needed, I've split them out into macros, so that each intrinsic
definition doesn't need its own endianness conditions.
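With these macros, the big-endian variant of a generated intrinsic in
arm_neon.h ends up looking roughly like the sketch below (illustrative,
not the exact generated code; __ai is the header's always_inline
marker, and __lane_reverse_128_32 expands to the index list appropriate
for the current target):

    #if !defined(__LITTLE_ENDIAN__)
    __ai int32x4_t vaddq_s32(int32x4_t __p0, int32x4_t __p1) {
      int32x4_t __rev0 = __builtin_shufflevector(__p0, __p0, __lane_reverse_128_32);
      int32x4_t __rev1 = __builtin_shufflevector(__p1, __p1, __lane_reverse_128_32);
      int32x4_t __ret = __rev0 + __rev1;
      return __builtin_shufflevector(__ret, __ret, __lane_reverse_128_32);
    }
    #endif

Because the macro name encodes only the vector and lane widths, the
same intrinsic body works on both AArch64 and AArch32 big-endian
targets.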
Added:
clang/test/CodeGen/arm-neon-endianness.c
Modified:
clang/utils/TableGen/NeonEmitter.cpp
Removed:
################################################################################
diff --git a/clang/test/CodeGen/arm-neon-endianness.c b/clang/test/CodeGen/arm-neon-endianness.c
new file mode 100644
index 0000000000000..ba2471ee39d3e
--- /dev/null
+++ b/clang/test/CodeGen/arm-neon-endianness.c
@@ -0,0 +1,115 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+
+// REQUIRES: arm-registered-target
+
+// RUN: %clang_cc1 -triple armv8a-arm-none-eabihf -target-cpu generic -emit-llvm -o - %s -disable-O0-optnone | \
+// RUN: opt -S -passes=instcombine -o - | FileCheck %s --check-prefix=LE
+// RUN: %clang_cc1 -triple armebv8a-arm-none-eabihf -target-cpu generic -emit-llvm -o - %s -disable-O0-optnone | \
+// RUN: opt -S -passes=instcombine -o - | FileCheck %s --check-prefix=BE
+
+#include <arm_neon.h>
+
+// LE-LABEL: define dso_local i32 @int32x4_t_lane_0(
+// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 0
+// LE-NEXT: ret i32 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i32 @int32x4_t_lane_0(
+// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 1
+// BE-NEXT: ret i32 [[VGET_LANE]]
+//
+int int32x4_t_lane_0(int32x4_t a) { return vgetq_lane_s32(a, 0); }
+// LE-LABEL: define dso_local i32 @int32x4_t_lane_1(
+// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 1
+// LE-NEXT: ret i32 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i32 @int32x4_t_lane_1(
+// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 0
+// BE-NEXT: ret i32 [[VGET_LANE]]
+//
+int int32x4_t_lane_1(int32x4_t a) { return vgetq_lane_s32(a, 1); }
+// LE-LABEL: define dso_local i32 @int32x4_t_lane_2(
+// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 2
+// LE-NEXT: ret i32 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i32 @int32x4_t_lane_2(
+// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
+// BE-NEXT: ret i32 [[VGET_LANE]]
+//
+int int32x4_t_lane_2(int32x4_t a) { return vgetq_lane_s32(a, 2); }
+// LE-LABEL: define dso_local i32 @int32x4_t_lane_3(
+// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
+// LE-NEXT: ret i32 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i32 @int32x4_t_lane_3(
+// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 2
+// BE-NEXT: ret i32 [[VGET_LANE]]
+//
+int int32x4_t_lane_3(int32x4_t a) { return vgetq_lane_s32(a, 3); }
+// LE-LABEL: define dso_local i32 @int32x2_t_lane_0(
+// LE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 0
+// LE-NEXT: ret i32 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i32 @int32x2_t_lane_0(
+// BE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
+// BE-NEXT: ret i32 [[VGET_LANE]]
+//
+int int32x2_t_lane_0(int32x2_t a) { return vget_lane_s32(a, 0); }
+// LE-LABEL: define dso_local i32 @int32x2_t_lane_1(
+// LE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
+// LE-NEXT: ret i32 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i32 @int32x2_t_lane_1(
+// BE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 0
+// BE-NEXT: ret i32 [[VGET_LANE]]
+//
+int int32x2_t_lane_1(int32x2_t a) { return vget_lane_s32(a, 1); }
+// LE-LABEL: define dso_local i64 @int64x2_t_lane_0(
+// LE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 0
+// LE-NEXT: ret i64 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i64 @int64x2_t_lane_0(
+// BE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 0
+// BE-NEXT: ret i64 [[VGET_LANE]]
+//
+int64_t int64x2_t_lane_0(int64x2_t a) { return vgetq_lane_s64(a, 0); }
+// LE-LABEL: define dso_local i64 @int64x2_t_lane_1(
+// LE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// LE-NEXT: [[ENTRY:.*:]]
+// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
+// LE-NEXT: ret i64 [[VGET_LANE]]
+//
+// BE-LABEL: define dso_local i64 @int64x2_t_lane_1(
+// BE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// BE-NEXT: [[ENTRY:.*:]]
+// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
+// BE-NEXT: ret i64 [[VGET_LANE]]
+//
+int64_t int64x2_t_lane_1(int64x2_t a) { return vgetq_lane_s64(a, 1); }
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index a18f78697af1c..5669b5e329587 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -1263,20 +1263,17 @@ void Intrinsic::emitReverseVariable(Variable &Dest, Variable &Src) {
for (unsigned K = 0; K < Dest.getType().getNumVectors(); ++K) {
OS << " " << Dest.getName() << ".val[" << K << "] = "
- << "__builtin_shufflevector("
- << Src.getName() << ".val[" << K << "], "
- << Src.getName() << ".val[" << K << "]";
- for (int J = Dest.getType().getNumElements() - 1; J >= 0; --J)
- OS << ", " << J;
- OS << ");";
+ << "__builtin_shufflevector(" << Src.getName() << ".val[" << K << "], "
+ << Src.getName() << ".val[" << K << "], __lane_reverse_"
+ << Dest.getType().getSizeInBits() << "_"
+ << Dest.getType().getElementSizeInBits() << ");";
emitNewLine();
}
} else {
- OS << " " << Dest.getName()
- << " = __builtin_shufflevector(" << Src.getName() << ", " << Src.getName();
- for (int J = Dest.getType().getNumElements() - 1; J >= 0; --J)
- OS << ", " << J;
- OS << ");";
+ OS << " " << Dest.getName() << " = __builtin_shufflevector("
+ << Src.getName() << ", " << Src.getName() << ", __lane_reverse_"
+ << Dest.getType().getSizeInBits() << "_"
+ << Dest.getType().getElementSizeInBits() << ");";
emitNewLine();
}
}
@@ -1877,10 +1874,11 @@ std::string Intrinsic::generate() {
OS << "#else\n";
- // Big endian intrinsics are more complex. The user intended these
- // intrinsics to operate on a vector "as-if" loaded by (V)LDR,
- // but we load as-if (V)LD1. So we should swap all arguments and
- // swap the return value too.
+ // Big endian intrinsics are more complex. The user intended these intrinsics
+ // to operate on a vector "as-if" loaded by LDR (for AArch64), VLDR (for
+ // 64-bit vectors on AArch32), or VLDM (for 128-bit vectors on AArch32), but
+ // we load "as-if" LD1 (for AArch64) or VLD1 (for AArch32). So we should swap
+ // all arguments and swap the return value too.
//
// If we call sub-intrinsics, we should call a version that does
// not re-swap the arguments!
@@ -2434,6 +2432,31 @@ void NeonEmitter::run(raw_ostream &OS) {
OS << "#define __ai static __inline__ __attribute__((__always_inline__, "
"__nodebug__))\n\n";
+ // Shufflevector arguments lists for endian-swapping vectors for big-endian
+ // targets. For AArch64, we need to reverse every lane in the vector, but for
+ // AArch32 we need to reverse the lanes within each 64-bit chunk of the
+ // vector. The naming convention here is __lane_reverse_<n>_<m>, where <n> is
+ // the length of the vector in bits, and <m> is the length of each lane in bits.
+ OS << "#if !defined(__LITTLE_ENDIAN__)\n";
+ OS << "#if defined(__aarch64__) || defined(__arm64ec__)\n";
+ OS << "#define __lane_reverse_64_32 1,0\n";
+ OS << "#define __lane_reverse_64_16 3,2,1,0\n";
+ OS << "#define __lane_reverse_64_8 7,6,5,4,3,2,1,0\n";
+ OS << "#define __lane_reverse_128_64 1,0\n";
+ OS << "#define __lane_reverse_128_32 3,2,1,0\n";
+ OS << "#define __lane_reverse_128_16 7,6,5,4,3,2,1,0\n";
+ OS << "#define __lane_reverse_128_8 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0\n";
+ OS << "#else\n";
+ OS << "#define __lane_reverse_64_32 1,0\n";
+ OS << "#define __lane_reverse_64_16 3,2,1,0\n";
+ OS << "#define __lane_reverse_64_8 7,6,5,4,3,2,1,0\n";
+ OS << "#define __lane_reverse_128_64 0,1\n";
+ OS << "#define __lane_reverse_128_32 1,0,3,2\n";
+ OS << "#define __lane_reverse_128_16 3,2,1,0,7,6,5,4\n";
+ OS << "#define __lane_reverse_128_8 7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8\n";
+ OS << "#endif\n";
+ OS << "#endif\n";
+
SmallVector<Intrinsic *, 128> Defs;
for (const Record *R : Records.getAllDerivedDefinitions("Inst"))
createIntrinsic(R, Defs);