[llvm] [AArch64] recognise trn1/trn2 with flipped operands (PR #169858)
Philip Ginsbach-Chen via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 3 11:33:27 PST 2025
https://github.com/ginsbach updated https://github.com/llvm/llvm-project/pull/169858
From 428d46377d049580c172aa1022a2fb70a7cad385 Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen <philip.ginsbach at cantab.net>
Date: Mon, 10 Nov 2025 21:18:40 +0000
Subject: [PATCH 1/5] [AArch64] recognise trn1/trn2 with flipped operands
---
.../Target/AArch64/AArch64ISelLowering.cpp | 14 +-
.../Target/AArch64/AArch64PerfectShuffle.h | 51 +++--
.../GISel/AArch64PostLegalizerLowering.cpp | 7 +-
llvm/test/CodeGen/AArch64/arm64-trn.ll | 81 ++++++++
.../AArch64/fixed-vector-deinterleave.ll | 8 +-
llvm/test/CodeGen/AArch64/reduce-shuffle.ll | 185 +++++++++---------
6 files changed, 224 insertions(+), 122 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d379a28ea5523..b06b3066831a9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14815,9 +14815,10 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
}
- if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
+ if (isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
- return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
+ return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
+ OperandOrder == 0 ? V2 : V1);
}
if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
@@ -16529,7 +16530,7 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
isREVMask(M, EltSize, NumElts, 16) ||
isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
isSingletonEXTMask(M, VT, DummyUnsigned) ||
- isTRNMask(M, NumElts, DummyUnsigned) ||
+ isTRNMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
isUZPMask(M, NumElts, DummyUnsigned) ||
isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
@@ -31588,10 +31589,13 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
OperandOrder == 0 ? Op1 : Op2,
OperandOrder == 0 ? Op2 : Op1));
- if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
+ if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
+ OperandOrder)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
return convertFromScalableVector(
- DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
+ DAG, VT,
+ DAG.getNode(Opc, DL, ContainerVT, OperandOrder == 0 ? Op1 : Op2,
+ OperandOrder == 0 ? Op2 : Op1));
}
if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
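The operand swap in the lowering above relies on a simple lane identity, restated informally here for 4 x i32 (indices into the concatenation V1:V2; this is an illustration, not text from the patch):

    trn1(V1, V2) = <0, 4, 2, 6>      trn1(V2, V1) = <4, 0, 6, 2>
    trn2(V1, V2) = <1, 5, 3, 7>      trn2(V2, V1) = <5, 1, 7, 3>

So a shuffle mask in one of the right-hand forms can still be lowered to trn1/trn2, provided the two inputs are passed to the instruction in swapped order.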
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index ef8786d0ad0e1..e6fa2ae7265f5 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6699,33 +6699,52 @@ inline bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
}
/// Return true for trn1 or trn2 masks of the form:
-/// <0, 8, 2, 10, 4, 12, 6, 14> or
-/// <1, 9, 3, 11, 5, 13, 7, 15>
+/// <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0, OperandOrderOut = 0) or
+/// <1, 9, 3, 11, 5, 13, 7, 15> (WhichResultOut = 1, OperandOrderOut = 0) or
+/// <8, 0, 10, 2, 12, 4, 14, 6> (WhichResultOut = 0, OperandOrderOut = 1) or
+/// <9, 1, 11, 3, 13, 5, 15, 7> (WhichResultOut = 1, OperandOrderOut = 1) or
inline bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
- unsigned &WhichResultOut) {
+ unsigned &WhichResultOut, unsigned &OperandOrderOut) {
if (NumElts % 2 != 0)
return false;
- // Check the first non-undef element for trn1 vs trn2.
- unsigned WhichResult = 2;
+
+ // "Variant" refers to the distinction bwetween trn1 and trn2, while
+ // "Order" refers to sequence of input registers (matching vs flipped).
+ bool Variant0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0
+ bool Variant1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0
+ bool Variant0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1
+ bool Variant1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1
+ // Check all elements match.
for (unsigned i = 0; i != NumElts; i += 2) {
if (M[i] >= 0) {
- WhichResult = ((unsigned)M[i] == i ? 0 : 1);
- break;
+ unsigned EvenElt = (unsigned)M[i];
+ if (EvenElt != i)
+ Variant0Order0 = false;
+ if (EvenElt != i + 1)
+ Variant1Order0 = false;
+ if (EvenElt != NumElts + i)
+ Variant0Order1 = false;
+ if (EvenElt != NumElts + i + 1)
+ Variant1Order1 = false;
}
if (M[i + 1] >= 0) {
- WhichResult = ((unsigned)M[i + 1] == i + NumElts ? 0 : 1);
- break;
+ unsigned OddElt = (unsigned)M[i + 1];
+ if (OddElt != NumElts + i)
+ Variant0Order0 = false;
+ if (OddElt != NumElts + i + 1)
+ Variant1Order0 = false;
+ if (OddElt != i)
+ Variant0Order1 = false;
+ if (OddElt != i + 1)
+ Variant1Order1 = false;
}
}
- if (WhichResult == 2)
+
+ if (Variant0Order0 + Variant1Order0 + Variant0Order1 + Variant1Order1 != 1)
return false;
- for (unsigned i = 0; i < NumElts; i += 2) {
- if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
- (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
- return false;
- }
- WhichResultOut = WhichResult;
+ WhichResultOut = (Variant0Order0 || Variant0Order1) ? 0 : 1;
+ OperandOrderOut = (Variant0Order0 || Variant1Order0) ? 0 : 1;
return true;
}
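The candidate-elimination scheme is easiest to follow on a small standalone example. The sketch below is a simplified restatement of the same logic (illustration only, not the header itself, and with a made-up driver), classifying a 4-element mask:

    #include <cstdio>

    // Simplified restatement of the isTRNMask candidate elimination: start
    // with all four (result, operand-order) candidates enabled and knock out
    // any candidate contradicted by a defined mask element; undef (-1)
    // elements constrain nothing.
    static bool classifyTRN(const int *M, unsigned NumElts,
                            unsigned &WhichResult, unsigned &OperandOrder) {
      if (NumElts % 2 != 0)
        return false;
      bool R0O0 = true, R1O0 = true, R0O1 = true, R1O1 = true;
      for (unsigned i = 0; i != NumElts; i += 2) {
        if (M[i] >= 0) {
          unsigned E = (unsigned)M[i];
          R0O0 &= (E == i);               // trn1, operands in order
          R1O0 &= (E == i + 1);           // trn2, operands in order
          R0O1 &= (E == NumElts + i);     // trn1, operands flipped
          R1O1 &= (E == NumElts + i + 1); // trn2, operands flipped
        }
        if (M[i + 1] >= 0) {
          unsigned O = (unsigned)M[i + 1];
          R0O0 &= (O == NumElts + i);
          R1O0 &= (O == NumElts + i + 1);
          R0O1 &= (O == i);
          R1O1 &= (O == i + 1);
        }
      }
      // Exactly one surviving candidate is required, as in the patch.
      if (R0O0 + R1O0 + R0O1 + R1O1 != 1)
        return false;
      WhichResult = (R0O0 || R0O1) ? 0 : 1;
      OperandOrder = (R0O0 || R1O0) ? 0 : 1;
      return true;
    }

    int main() {
      int Flipped[] = {4, 0, 6, 2}; // trn1 with the two inputs swapped
      unsigned WhichResult, OperandOrder;
      if (classifyTRN(Flipped, 4, WhichResult, OperandOrder))
        printf("trn%u, operands %s\n", WhichResult + 1,
               OperandOrder ? "flipped" : "in order");
      return 0;
    }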
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 4fba593b3d0fb..221a7bcd881bb 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -215,14 +215,15 @@ bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI,
ShuffleVectorPseudo &MatchInfo) {
assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
unsigned WhichResult;
+ unsigned OperandOrder;
ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
Register Dst = MI.getOperand(0).getReg();
unsigned NumElts = MRI.getType(Dst).getNumElements();
- if (!isTRNMask(ShuffleMask, NumElts, WhichResult))
+ if (!isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder))
return false;
unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2;
- Register V1 = MI.getOperand(1).getReg();
- Register V2 = MI.getOperand(2).getReg();
+ Register V1 = MI.getOperand(OperandOrder == 0 ? 1 : 2).getReg();
+ Register V2 = MI.getOperand(OperandOrder == 0 ? 2 : 1).getReg();
MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
return true;
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-trn.ll b/llvm/test/CodeGen/AArch64/arm64-trn.ll
index fe245d01a7a6d..9c1c614551bdb 100644
--- a/llvm/test/CodeGen/AArch64/arm64-trn.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-trn.ll
@@ -246,6 +246,87 @@ define <4 x float> @vtrnQf(ptr %A, ptr %B) nounwind {
ret <4 x float> %tmp5
}
+define <8 x i8> @vtrni8_8first(ptr %A, ptr %B) nounwind {
+; CHECKLE-LABEL: vtrni8_8first:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr d0, [x0]
+; CHECKLE-NEXT: ldr d1, [x1]
+; CHECKLE-NEXT: trn1 v2.8b, v1.8b, v0.8b
+; CHECKLE-NEXT: trn2 v0.8b, v0.8b, v1.8b
+; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrni8_8first:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.8b }, [x0]
+; CHECKBE-NEXT: ld1 { v1.8b }, [x1]
+; CHECKBE-NEXT: trn1 v2.8b, v1.8b, v0.8b
+; CHECKBE-NEXT: trn2 v0.8b, v0.8b, v1.8b
+; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b
+; CHECKBE-NEXT: rev64 v0.8b, v0.8b
+; CHECKBE-NEXT: ret
+ %tmp1 = load <8 x i8>, ptr %A
+ %tmp2 = load <8 x i8>, ptr %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <8 x i8> @vtrni8_9first(ptr %A, ptr %B) nounwind {
+; CHECKLE-LABEL: vtrni8_9first:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr d0, [x0]
+; CHECKLE-NEXT: ldr d1, [x1]
+; CHECKLE-NEXT: trn1 v2.8b, v1.8b, v0.8b
+; CHECKLE-NEXT: trn2 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrni8_9first:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.8b }, [x0]
+; CHECKBE-NEXT: ld1 { v1.8b }, [x1]
+; CHECKBE-NEXT: trn1 v2.8b, v1.8b, v0.8b
+; CHECKBE-NEXT: trn2 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b
+; CHECKBE-NEXT: rev64 v0.8b, v0.8b
+; CHECKBE-NEXT: ret
+ %tmp1 = load <8 x i8>, ptr %A
+ %tmp2 = load <8 x i8>, ptr %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <8 x i8> @vtrni8_89first_undef(ptr %A, ptr %B) nounwind {
+; CHECKLE-LABEL: vtrni8_89first_undef:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr d0, [x0]
+; CHECKLE-NEXT: ldr d1, [x1]
+; CHECKLE-NEXT: trn1 v2.8b, v1.8b, v0.8b
+; CHECKLE-NEXT: trn2 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrni8_89first_undef:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.8b }, [x0]
+; CHECKBE-NEXT: ld1 { v1.8b }, [x1]
+; CHECKBE-NEXT: trn1 v2.8b, v1.8b, v0.8b
+; CHECKBE-NEXT: trn2 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b
+; CHECKBE-NEXT: rev64 v0.8b, v0.8b
+; CHECKBE-NEXT: ret
+ %tmp1 = load <8 x i8>, ptr %A
+ %tmp2 = load <8 x i8>, ptr %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 8, i32 0, i32 poison, i32 2, i32 poison, i32 4, i32 14, i32 6>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 9, i32 1, i32 poison, i32 3, i32 13, i32 5, i32 15, i32 poison>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
; Undef shuffle indices (even at the start of the shuffle mask) should not prevent matching to VTRN:
define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind {
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index 282e0503dd7be..8e75d69be5062 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -6,12 +6,10 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
; CHECK-SD-LABEL: vector_deinterleave_v2f16_v4f16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: dup v2.2s, v0.s[1]
-; CHECK-SD-NEXT: mov v1.16b, v2.16b
-; CHECK-SD-NEXT: zip1 v2.4h, v0.4h, v2.4h
-; CHECK-SD-NEXT: mov v1.h[0], v0.h[1]
+; CHECK-SD-NEXT: dup v1.2s, v0.s[1]
+; CHECK-SD-NEXT: zip1 v2.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: trn2 v1.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: fmov d0, d2
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: vector_deinterleave_v2f16_v4f16:
diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
index 072f6f4e8f73e..39beffcf85783 100644
--- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
@@ -36,93 +36,93 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: zip1 v5.4s, v3.4s, v0.4s
; CHECK-NEXT: trn1 v6.4s, v3.4s, v0.4s
; CHECK-NEXT: zip2 v0.4s, v3.4s, v0.4s
-; CHECK-NEXT: ext v16.16b, v1.16b, v1.16b, #12
-; CHECK-NEXT: zip2 v17.4s, v1.4s, v2.4s
-; CHECK-NEXT: zip2 v7.4s, v2.4s, v1.4s
-; CHECK-NEXT: zip1 v18.4s, v2.4s, v1.4s
+; CHECK-NEXT: ext v7.16b, v1.16b, v1.16b, #12
+; CHECK-NEXT: zip2 v16.4s, v1.4s, v2.4s
+; CHECK-NEXT: zip1 v17.4s, v2.4s, v1.4s
+; CHECK-NEXT: trn2 v18.4s, v2.4s, v1.4s
; CHECK-NEXT: uzp2 v4.4s, v4.4s, v1.4s
; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #8
-; CHECK-NEXT: mov v1.s[0], v2.s[1]
-; CHECK-NEXT: ext v2.16b, v2.16b, v16.16b, #12
-; CHECK-NEXT: mov v17.d[1], v6.d[1]
-; CHECK-NEXT: mov v7.d[1], v6.d[1]
+; CHECK-NEXT: zip2 v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ext v2.16b, v2.16b, v7.16b, #12
+; CHECK-NEXT: mov v16.d[1], v6.d[1]
+; CHECK-NEXT: mov v18.d[1], v5.d[1]
; CHECK-NEXT: mov v4.d[1], v0.d[1]
-; CHECK-NEXT: mov v18.d[1], v3.d[1]
-; CHECK-NEXT: mov v1.d[1], v5.d[1]
+; CHECK-NEXT: mov v17.d[1], v3.d[1]
+; CHECK-NEXT: mov v1.d[1], v6.d[1]
; CHECK-NEXT: mov v2.d[1], v0.d[1]
-; CHECK-NEXT: add v0.4s, v4.4s, v17.4s
-; CHECK-NEXT: add v3.4s, v1.4s, v18.4s
-; CHECK-NEXT: sub v1.4s, v18.4s, v1.4s
-; CHECK-NEXT: sub v2.4s, v7.4s, v2.4s
+; CHECK-NEXT: add v0.4s, v4.4s, v16.4s
+; CHECK-NEXT: add v3.4s, v18.4s, v17.4s
+; CHECK-NEXT: sub v6.4s, v17.4s, v18.4s
+; CHECK-NEXT: sub v1.4s, v1.4s, v2.4s
; CHECK-NEXT: rev64 v4.4s, v0.4s
; CHECK-NEXT: rev64 v5.4s, v3.4s
-; CHECK-NEXT: sub v6.4s, v1.4s, v2.4s
-; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: sub v2.4s, v6.4s, v1.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v6.4s
; CHECK-NEXT: mov v4.d[1], v0.d[1]
; CHECK-NEXT: mov v5.d[1], v3.d[1]
-; CHECK-NEXT: rev64 v2.4s, v6.4s
+; CHECK-NEXT: rev64 v6.4s, v2.4s
; CHECK-NEXT: rev64 v7.4s, v1.4s
; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s
; CHECK-NEXT: add v0.4s, v0.4s, v5.4s
-; CHECK-NEXT: sub v2.4s, v6.4s, v2.4s
+; CHECK-NEXT: sub v4.4s, v2.4s, v6.4s
; CHECK-NEXT: sub v5.4s, v1.4s, v7.4s
-; CHECK-NEXT: addp v4.4s, v3.4s, v6.4s
+; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s
; CHECK-NEXT: addp v1.4s, v0.4s, v1.4s
; CHECK-NEXT: rev64 v6.4s, v0.4s
; CHECK-NEXT: rev64 v7.4s, v3.4s
-; CHECK-NEXT: ext v16.16b, v4.16b, v2.16b, #4
+; CHECK-NEXT: ext v16.16b, v2.16b, v4.16b, #4
; CHECK-NEXT: ext v17.16b, v1.16b, v5.16b, #4
; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s
; CHECK-NEXT: sub v3.4s, v3.4s, v7.4s
-; CHECK-NEXT: mov v7.16b, v2.16b
-; CHECK-NEXT: zip2 v6.4s, v16.4s, v4.4s
+; CHECK-NEXT: mov v7.16b, v4.16b
+; CHECK-NEXT: zip2 v6.4s, v16.4s, v2.4s
; CHECK-NEXT: mov v16.16b, v5.16b
; CHECK-NEXT: zip2 v17.4s, v17.4s, v1.4s
; CHECK-NEXT: ext v18.16b, v0.16b, v1.16b, #4
-; CHECK-NEXT: mov v7.s[2], v4.s[3]
+; CHECK-NEXT: mov v7.s[2], v2.s[3]
; CHECK-NEXT: mov v21.16b, v3.16b
; CHECK-NEXT: mov v16.s[2], v1.s[3]
; CHECK-NEXT: ext v5.16b, v5.16b, v17.16b, #12
; CHECK-NEXT: zip1 v17.4s, v1.4s, v1.4s
-; CHECK-NEXT: ext v2.16b, v2.16b, v6.16b, #12
+; CHECK-NEXT: ext v4.16b, v4.16b, v6.16b, #12
; CHECK-NEXT: ext v18.16b, v18.16b, v18.16b, #4
; CHECK-NEXT: mov v19.16b, v7.16b
-; CHECK-NEXT: ext v6.16b, v3.16b, v4.16b, #8
-; CHECK-NEXT: mov v21.s[2], v4.s[1]
+; CHECK-NEXT: ext v6.16b, v3.16b, v2.16b, #8
+; CHECK-NEXT: mov v21.s[2], v2.s[1]
; CHECK-NEXT: mov v20.16b, v16.16b
-; CHECK-NEXT: mov v19.s[1], v4.s[2]
+; CHECK-NEXT: mov v19.s[1], v2.s[2]
; CHECK-NEXT: trn2 v0.4s, v17.4s, v0.4s
; CHECK-NEXT: sub v16.4s, v16.4s, v5.4s
; CHECK-NEXT: mov v17.16b, v18.16b
; CHECK-NEXT: ext v3.16b, v6.16b, v3.16b, #4
-; CHECK-NEXT: sub v7.4s, v7.4s, v2.4s
+; CHECK-NEXT: sub v7.4s, v7.4s, v4.4s
; CHECK-NEXT: mov v20.s[1], v1.s[2]
; CHECK-NEXT: mov v17.s[0], v1.s[1]
; CHECK-NEXT: mov v1.16b, v21.16b
-; CHECK-NEXT: add v2.4s, v19.4s, v2.4s
-; CHECK-NEXT: uzp2 v3.4s, v6.4s, v3.4s
+; CHECK-NEXT: add v4.4s, v19.4s, v4.4s
; CHECK-NEXT: add v5.4s, v20.4s, v5.4s
-; CHECK-NEXT: mov v1.s[1], v4.s[0]
-; CHECK-NEXT: sub v4.4s, v0.4s, v18.4s
-; CHECK-NEXT: mov v2.d[1], v7.d[1]
+; CHECK-NEXT: mov v1.s[1], v2.s[0]
+; CHECK-NEXT: uzp2 v2.4s, v6.4s, v3.4s
+; CHECK-NEXT: sub v3.4s, v0.4s, v18.4s
; CHECK-NEXT: add v0.4s, v0.4s, v17.4s
+; CHECK-NEXT: mov v4.d[1], v7.d[1]
; CHECK-NEXT: mov v5.d[1], v16.d[1]
-; CHECK-NEXT: sub v6.4s, v21.4s, v3.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mov v0.d[1], v4.d[1]
-; CHECK-NEXT: cmlt v4.8h, v2.8h, #0
-; CHECK-NEXT: cmlt v3.8h, v5.8h, #0
+; CHECK-NEXT: sub v6.4s, v21.4s, v2.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: mov v0.d[1], v3.d[1]
+; CHECK-NEXT: cmlt v3.8h, v4.8h, #0
+; CHECK-NEXT: cmlt v2.8h, v5.8h, #0
; CHECK-NEXT: mov v1.d[1], v6.d[1]
-; CHECK-NEXT: add v2.4s, v4.4s, v2.4s
; CHECK-NEXT: cmlt v6.8h, v0.8h, #0
-; CHECK-NEXT: add v5.4s, v3.4s, v5.4s
-; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b
+; CHECK-NEXT: add v4.4s, v3.4s, v4.4s
+; CHECK-NEXT: add v5.4s, v2.4s, v5.4s
; CHECK-NEXT: cmlt v7.8h, v1.8h, #0
; CHECK-NEXT: add v0.4s, v6.4s, v0.4s
-; CHECK-NEXT: eor v3.16b, v5.16b, v3.16b
+; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
+; CHECK-NEXT: eor v2.16b, v5.16b, v2.16b
; CHECK-NEXT: add v1.4s, v7.4s, v1.4s
; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b
-; CHECK-NEXT: add v2.4s, v3.4s, v2.4s
+; CHECK-NEXT: add v2.4s, v2.4s, v3.4s
; CHECK-NEXT: eor v1.16b, v1.16b, v7.16b
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
@@ -255,77 +255,76 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h
; CHECK-NEXT: saddw v2.4s, v7.4s, v2.4h
; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s
-; CHECK-NEXT: trn1 v18.4s, v1.4s, v0.4s
+; CHECK-NEXT: trn1 v6.4s, v1.4s, v0.4s
; CHECK-NEXT: zip2 v0.4s, v1.4s, v0.4s
; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s
-; CHECK-NEXT: mov v7.16b, v3.16b
-; CHECK-NEXT: zip1 v6.4s, v2.4s, v3.4s
-; CHECK-NEXT: zip2 v16.4s, v3.4s, v2.4s
+; CHECK-NEXT: zip1 v7.4s, v2.4s, v3.4s
+; CHECK-NEXT: trn2 v16.4s, v2.4s, v3.4s
+; CHECK-NEXT: ext v18.16b, v3.16b, v3.16b, #12
; CHECK-NEXT: ext v17.16b, v1.16b, v4.16b, #8
-; CHECK-NEXT: mov v7.s[0], v2.s[1]
-; CHECK-NEXT: ext v1.16b, v3.16b, v3.16b, #12
+; CHECK-NEXT: zip2 v1.4s, v3.4s, v2.4s
; CHECK-NEXT: uzp2 v5.4s, v5.4s, v3.4s
+; CHECK-NEXT: mov v16.d[1], v4.d[1]
; CHECK-NEXT: zip2 v3.4s, v2.4s, v3.4s
-; CHECK-NEXT: mov v16.d[1], v18.d[1]
-; CHECK-NEXT: mov v6.d[1], v17.d[1]
-; CHECK-NEXT: mov v7.d[1], v4.d[1]
-; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #12
+; CHECK-NEXT: ext v2.16b, v2.16b, v18.16b, #12
+; CHECK-NEXT: mov v7.d[1], v17.d[1]
+; CHECK-NEXT: mov v1.d[1], v6.d[1]
; CHECK-NEXT: mov v5.d[1], v0.d[1]
-; CHECK-NEXT: mov v3.d[1], v18.d[1]
-; CHECK-NEXT: add v2.4s, v7.4s, v6.4s
-; CHECK-NEXT: mov v1.d[1], v0.d[1]
-; CHECK-NEXT: add v4.4s, v5.4s, v16.4s
-; CHECK-NEXT: rev64 v5.4s, v2.4s
-; CHECK-NEXT: rev64 v0.4s, v4.4s
-; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: sub v3.4s, v6.4s, v7.4s
-; CHECK-NEXT: mov v5.d[1], v2.d[1]
-; CHECK-NEXT: add v6.4s, v1.4s, v3.4s
-; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mov v0.d[1], v4.d[1]
-; CHECK-NEXT: add v4.4s, v4.4s, v5.4s
-; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: zip1 v2.4s, v4.4s, v6.4s
-; CHECK-NEXT: uzp2 v3.4s, v4.4s, v6.4s
-; CHECK-NEXT: zip2 v16.4s, v4.4s, v6.4s
-; CHECK-NEXT: zip1 v5.4s, v0.4s, v1.4s
-; CHECK-NEXT: trn1 v7.4s, v0.4s, v1.4s
-; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: trn2 v2.4s, v4.4s, v2.4s
-; CHECK-NEXT: uzp2 v3.4s, v3.4s, v4.4s
-; CHECK-NEXT: mov v4.s[1], v6.s[1]
-; CHECK-NEXT: ext v0.16b, v0.16b, v5.16b, #8
-; CHECK-NEXT: mov v16.d[1], v7.d[1]
-; CHECK-NEXT: mov v3.d[1], v1.d[1]
-; CHECK-NEXT: mov v4.d[1], v5.d[1]
+; CHECK-NEXT: mov v3.d[1], v6.d[1]
; CHECK-NEXT: mov v2.d[1], v0.d[1]
-; CHECK-NEXT: add v0.4s, v16.4s, v3.4s
-; CHECK-NEXT: sub v3.4s, v3.4s, v16.4s
-; CHECK-NEXT: add v1.4s, v4.4s, v2.4s
-; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s
+; CHECK-NEXT: add v4.4s, v16.4s, v7.4s
+; CHECK-NEXT: sub v6.4s, v7.4s, v16.4s
+; CHECK-NEXT: add v1.4s, v5.4s, v1.4s
+; CHECK-NEXT: sub v2.4s, v3.4s, v2.4s
+; CHECK-NEXT: rev64 v5.4s, v4.4s
+; CHECK-NEXT: rev64 v0.4s, v1.4s
+; CHECK-NEXT: add v3.4s, v2.4s, v6.4s
+; CHECK-NEXT: sub v2.4s, v6.4s, v2.4s
+; CHECK-NEXT: mov v5.d[1], v4.d[1]
+; CHECK-NEXT: mov v0.d[1], v1.d[1]
+; CHECK-NEXT: add v1.4s, v1.4s, v5.4s
+; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s
+; CHECK-NEXT: zip1 v4.4s, v1.4s, v3.4s
+; CHECK-NEXT: uzp2 v5.4s, v1.4s, v3.4s
+; CHECK-NEXT: zip2 v7.4s, v1.4s, v3.4s
+; CHECK-NEXT: zip1 v6.4s, v0.4s, v2.4s
+; CHECK-NEXT: trn1 v16.4s, v0.4s, v2.4s
+; CHECK-NEXT: zip2 v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: trn2 v4.4s, v1.4s, v4.4s
+; CHECK-NEXT: uzp2 v5.4s, v5.4s, v1.4s
+; CHECK-NEXT: mov v1.s[1], v3.s[1]
+; CHECK-NEXT: ext v0.16b, v0.16b, v6.16b, #8
+; CHECK-NEXT: mov v7.d[1], v16.d[1]
+; CHECK-NEXT: mov v5.d[1], v2.d[1]
+; CHECK-NEXT: mov v1.d[1], v6.d[1]
+; CHECK-NEXT: mov v4.d[1], v0.d[1]
+; CHECK-NEXT: add v0.4s, v7.4s, v5.4s
+; CHECK-NEXT: sub v3.4s, v5.4s, v7.4s
+; CHECK-NEXT: add v2.4s, v1.4s, v4.4s
+; CHECK-NEXT: sub v1.4s, v4.4s, v1.4s
; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #4
; CHECK-NEXT: zip2 v6.4s, v0.4s, v3.4s
; CHECK-NEXT: zip2 v7.4s, v3.4s, v0.4s
-; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #4
-; CHECK-NEXT: zip2 v16.4s, v2.4s, v1.4s
-; CHECK-NEXT: zip2 v17.4s, v1.4s, v2.4s
+; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #4
+; CHECK-NEXT: zip2 v16.4s, v1.4s, v2.4s
+; CHECK-NEXT: zip2 v17.4s, v2.4s, v1.4s
; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s
-; CHECK-NEXT: zip1 v1.4s, v1.4s, v2.4s
; CHECK-NEXT: ext v18.16b, v4.16b, v3.16b, #8
-; CHECK-NEXT: ext v19.16b, v5.16b, v2.16b, #8
+; CHECK-NEXT: ext v19.16b, v5.16b, v1.16b, #8
+; CHECK-NEXT: zip1 v1.4s, v2.4s, v1.4s
; CHECK-NEXT: add v2.4s, v16.4s, v7.4s
; CHECK-NEXT: sub v3.4s, v6.4s, v17.4s
-; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ext v4.16b, v18.16b, v4.16b, #4
-; CHECK-NEXT: cmlt v1.8h, v3.8h, #0
; CHECK-NEXT: cmlt v6.8h, v2.8h, #0
; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #4
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: cmlt v1.8h, v3.8h, #0
; CHECK-NEXT: add v2.4s, v6.4s, v2.4s
; CHECK-NEXT: add v3.4s, v1.4s, v3.4s
; CHECK-NEXT: add v4.4s, v5.4s, v4.4s
; CHECK-NEXT: cmlt v5.8h, v0.8h, #0
-; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b
+; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
; CHECK-NEXT: cmlt v7.8h, v4.8h, #0
; CHECK-NEXT: add v0.4s, v5.4s, v0.4s
; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
@@ -480,7 +479,7 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
; CHECK-NEXT: sub v3.4s, v3.4s, v7.4s
; CHECK-NEXT: uzp2 v4.4s, v1.4s, v0.4s
; CHECK-NEXT: uzp1 v7.4s, v1.4s, v0.4s
-; CHECK-NEXT: mov v6.s[3], v5.s[2]
+; CHECK-NEXT: trn1 v6.4s, v6.4s, v5.4s
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: zip2 v17.4s, v2.4s, v3.4s
; CHECK-NEXT: zip1 v2.4s, v2.4s, v3.4s
From 06a4039025d73b10a4a46fb11cde587bc0adae00 Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen <philip.ginsbach at cantab.net>
Date: Tue, 2 Dec 2025 22:17:24 +0000
Subject: [PATCH 2/5] rename, reformat, resimplify the added tests
---
llvm/test/CodeGen/AArch64/arm64-trn.ll | 74 +++++++++-----------------
1 file changed, 25 insertions(+), 49 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/arm64-trn.ll b/llvm/test/CodeGen/AArch64/arm64-trn.ll
index 9c1c614551bdb..120c2d13a7ab7 100644
--- a/llvm/test/CodeGen/AArch64/arm64-trn.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-trn.ll
@@ -246,85 +246,61 @@ define <4 x float> @vtrnQf(ptr %A, ptr %B) nounwind {
ret <4 x float> %tmp5
}
-define <8 x i8> @vtrni8_8first(ptr %A, ptr %B) nounwind {
-; CHECKLE-LABEL: vtrni8_8first:
+define <8 x i8> @vtrni8_trn1_flipped(<8 x i8> %A, <8 x i8> %B) nounwind {
+; CHECKLE-LABEL: vtrni8_trn1_flipped:
; CHECKLE: // %bb.0:
-; CHECKLE-NEXT: ldr d0, [x0]
-; CHECKLE-NEXT: ldr d1, [x1]
-; CHECKLE-NEXT: trn1 v2.8b, v1.8b, v0.8b
-; CHECKLE-NEXT: trn2 v0.8b, v0.8b, v1.8b
-; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b
+; CHECKLE-NEXT: trn1 v0.8b, v1.8b, v0.8b
; CHECKLE-NEXT: ret
;
-; CHECKBE-LABEL: vtrni8_8first:
+; CHECKBE-LABEL: vtrni8_trn1_flipped:
; CHECKBE: // %bb.0:
-; CHECKBE-NEXT: ld1 { v0.8b }, [x0]
-; CHECKBE-NEXT: ld1 { v1.8b }, [x1]
-; CHECKBE-NEXT: trn1 v2.8b, v1.8b, v0.8b
-; CHECKBE-NEXT: trn2 v0.8b, v0.8b, v1.8b
-; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b
+; CHECKBE-NEXT: rev64 v0.8b, v0.8b
+; CHECKBE-NEXT: rev64 v1.8b, v1.8b
+; CHECKBE-NEXT: trn1 v0.8b, v1.8b, v0.8b
; CHECKBE-NEXT: rev64 v0.8b, v0.8b
; CHECKBE-NEXT: ret
- %tmp1 = load <8 x i8>, ptr %A
- %tmp2 = load <8 x i8>, ptr %B
- %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
- %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
- %tmp5 = add <8 x i8> %tmp3, %tmp4
- ret <8 x i8> %tmp5
+ %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+ ret <8 x i8> %tmp1
}
-define <8 x i8> @vtrni8_9first(ptr %A, ptr %B) nounwind {
-; CHECKLE-LABEL: vtrni8_9first:
+define <8 x i8> @vtrni8_trn2_flipped(<8 x i8> %A, <8 x i8> %B) nounwind {
+; CHECKLE-LABEL: vtrni8_trn2_flipped:
; CHECKLE: // %bb.0:
-; CHECKLE-NEXT: ldr d0, [x0]
-; CHECKLE-NEXT: ldr d1, [x1]
-; CHECKLE-NEXT: trn1 v2.8b, v1.8b, v0.8b
; CHECKLE-NEXT: trn2 v0.8b, v1.8b, v0.8b
-; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b
; CHECKLE-NEXT: ret
;
-; CHECKBE-LABEL: vtrni8_9first:
+; CHECKBE-LABEL: vtrni8_trn2_flipped:
; CHECKBE: // %bb.0:
-; CHECKBE-NEXT: ld1 { v0.8b }, [x0]
-; CHECKBE-NEXT: ld1 { v1.8b }, [x1]
-; CHECKBE-NEXT: trn1 v2.8b, v1.8b, v0.8b
+; CHECKBE-NEXT: rev64 v0.8b, v0.8b
+; CHECKBE-NEXT: rev64 v1.8b, v1.8b
; CHECKBE-NEXT: trn2 v0.8b, v1.8b, v0.8b
-; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b
; CHECKBE-NEXT: rev64 v0.8b, v0.8b
; CHECKBE-NEXT: ret
- %tmp1 = load <8 x i8>, ptr %A
- %tmp2 = load <8 x i8>, ptr %B
- %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
- %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
- %tmp5 = add <8 x i8> %tmp3, %tmp4
- ret <8 x i8> %tmp5
+ %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
+ ret <8 x i8> %tmp1
}
-define <8 x i8> @vtrni8_89first_undef(ptr %A, ptr %B) nounwind {
-; CHECKLE-LABEL: vtrni8_89first_undef:
+define <8 x i8> @vtrni8_both_flipped_with_poison_values(<8 x i8> %A, <8 x i8> %B) nounwind {
+; CHECKLE-LABEL: vtrni8_both_flipped_with_poison_values:
; CHECKLE: // %bb.0:
-; CHECKLE-NEXT: ldr d0, [x0]
-; CHECKLE-NEXT: ldr d1, [x1]
; CHECKLE-NEXT: trn1 v2.8b, v1.8b, v0.8b
; CHECKLE-NEXT: trn2 v0.8b, v1.8b, v0.8b
; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b
; CHECKLE-NEXT: ret
;
-; CHECKBE-LABEL: vtrni8_89first_undef:
+; CHECKBE-LABEL: vtrni8_both_flipped_with_poison_values:
; CHECKBE: // %bb.0:
-; CHECKBE-NEXT: ld1 { v0.8b }, [x0]
-; CHECKBE-NEXT: ld1 { v1.8b }, [x1]
+; CHECKBE-NEXT: rev64 v0.8b, v0.8b
+; CHECKBE-NEXT: rev64 v1.8b, v1.8b
; CHECKBE-NEXT: trn1 v2.8b, v1.8b, v0.8b
; CHECKBE-NEXT: trn2 v0.8b, v1.8b, v0.8b
; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b
; CHECKBE-NEXT: rev64 v0.8b, v0.8b
; CHECKBE-NEXT: ret
- %tmp1 = load <8 x i8>, ptr %A
- %tmp2 = load <8 x i8>, ptr %B
- %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 8, i32 0, i32 poison, i32 2, i32 poison, i32 4, i32 14, i32 6>
- %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 9, i32 1, i32 poison, i32 3, i32 13, i32 5, i32 15, i32 poison>
- %tmp5 = add <8 x i8> %tmp3, %tmp4
- ret <8 x i8> %tmp5
+ %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 poison, i32 0, i32 poison, i32 2, i32 poison, i32 4, i32 14, i32 6>
+ %tmp2 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 poison, i32 1, i32 poison, i32 3, i32 13, i32 5, i32 15, i32 poison>
+ %tmp3 = add <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
}
; Undef shuffle indices (even at the start of the shuffle mask) should not prevent matching to VTRN:
From e55e58c61ecdd76144f369948e27baffc28666f2 Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen <philip.ginsbach at cantab.net>
Date: Tue, 2 Dec 2025 22:05:34 +0000
Subject: [PATCH 3/5] introduce TRN for better readability
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b06b3066831a9..a1c8fade7648e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -31592,10 +31592,10 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
OperandOrder)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
- return convertFromScalableVector(
- DAG, VT,
+ SDValue TRN =
DAG.getNode(Opc, DL, ContainerVT, OperandOrder == 0 ? Op1 : Op2,
- OperandOrder == 0 ? Op2 : Op1));
+ OperandOrder == 0 ? Op2 : Op1);
+ return convertFromScalableVector(DAG, VT, TRN);
}
if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
From 83ec81cd4117ad67c85ceb6ae319ae8577dbecba Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen <philip.ginsbach at cantab.net>
Date: Tue, 2 Dec 2025 22:23:04 +0000
Subject: [PATCH 4/5] rename to more directly match WhichResultOut
---
.../Target/AArch64/AArch64PerfectShuffle.h | 30 +++++++++----------
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index e6fa2ae7265f5..44108b49cc1cc 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6710,41 +6710,41 @@ inline bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
// "Variant" refers to the distinction bwetween trn1 and trn2, while
// "Order" refers to sequence of input registers (matching vs flipped).
- bool Variant0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0
- bool Variant1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0
- bool Variant0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1
- bool Variant1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1
+ bool Result0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0
+ bool Result1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0
+ bool Result0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1
+ bool Result1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1
// Check all elements match.
for (unsigned i = 0; i != NumElts; i += 2) {
if (M[i] >= 0) {
unsigned EvenElt = (unsigned)M[i];
if (EvenElt != i)
- Variant0Order0 = false;
+ Result0Order0 = false;
if (EvenElt != i + 1)
- Variant1Order0 = false;
+ Result1Order0 = false;
if (EvenElt != NumElts + i)
- Variant0Order1 = false;
+ Result0Order1 = false;
if (EvenElt != NumElts + i + 1)
- Variant1Order1 = false;
+ Result1Order1 = false;
}
if (M[i + 1] >= 0) {
unsigned OddElt = (unsigned)M[i + 1];
if (OddElt != NumElts + i)
- Variant0Order0 = false;
+ Result0Order0 = false;
if (OddElt != NumElts + i + 1)
- Variant1Order0 = false;
+ Result1Order0 = false;
if (OddElt != i)
- Variant0Order1 = false;
+ Result0Order1 = false;
if (OddElt != i + 1)
- Variant1Order1 = false;
+ Result1Order1 = false;
}
}
- if (Variant0Order0 + Variant1Order0 + Variant0Order1 + Variant1Order1 != 1)
+ if (Result0Order0 + Result1Order0 + Result0Order1 + Result1Order1 != 1)
return false;
- WhichResultOut = (Variant0Order0 || Variant0Order1) ? 0 : 1;
- OperandOrderOut = (Variant0Order0 || Variant1Order0) ? 0 : 1;
+ WhichResultOut = (Result0Order0 || Result0Order1) ? 0 : 1;
+ OperandOrderOut = (Result0Order0 || Result1Order0) ? 0 : 1;
return true;
}
From 51437fc6e577629b9f2e6526e3cc8be0833739a7 Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen <philip.ginsbach at cantab.net>
Date: Wed, 3 Dec 2025 19:33:21 +0000
Subject: [PATCH 5/5] update explanatory comment
---
llvm/lib/Target/AArch64/AArch64PerfectShuffle.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index 44108b49cc1cc..c7d6b31291197 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6708,8 +6708,9 @@ inline bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
if (NumElts % 2 != 0)
return false;
- // "Variant" refers to the distinction bwetween trn1 and trn2, while
- // "Order" refers to sequence of input registers (matching vs flipped).
+ // "Result" corresponds to "WhichResultOut", selecting between trn1 and trn2.
+ // "Order" corresponds to "OperandOrderOut", selecting the order of operands
+ // for the instruction (flipped or not).
bool Result0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0
bool Result1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0
bool Result0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1
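For reference, a caller of the updated isTRNMask signature looks roughly like the following call-site sketch. The surrounding names (DAG, DL, VT, V1, V2) are assumed from the SelectionDAG lowering context touched by the first patch; MaskStorage is a made-up example value, not code from the patch:

    // Example: a mask that matches trn1 only with the two inputs swapped.
    int MaskStorage[] = {8, 0, 10, 2, 12, 4, 14, 6};
    unsigned WhichResult, OperandOrder;
    if (isTRNMask(ArrayRef<int>(MaskStorage), /*NumElts=*/8, WhichResult,
                  OperandOrder)) {
      unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
      // OperandOrder == 1 means the mask matches trn only with V1/V2 swapped.
      SDValue TRN = DAG.getNode(Opc, DL, VT, OperandOrder == 0 ? V1 : V2,
                                OperandOrder == 0 ? V2 : V1);
    }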