[llvm] ac434af - [AArch64] Try to fold shuffle (tbl2, tbl2) to tbl4.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 21 11:16:33 PDT 2022
Author: Florian Hahn
Date: 2022-09-21T19:15:56+01:00
New Revision: ac434afed8dda112ef637cb27437cd82cf80a6c8
URL: https://github.com/llvm/llvm-project/commit/ac434afed8dda112ef637cb27437cd82cf80a6c8
DIFF: https://github.com/llvm/llvm-project/commit/ac434afed8dda112ef637cb27437cd82cf80a6c8.diff
LOG: [AArch64] Try to fold shuffle (tbl2, tbl2) to tbl4.
shuffle (tbl2, tbl2) can be folded into a single tbl4 if the tbl2 mask
entries for the elements selected from the second tbl2 are constant,
since those entries have to be rebased by 32 to index the second pair
of tbl4 table registers.
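
For illustration, the pattern being matched looks like the
shuffled_tbl2_to_tbl4 test updated below; a minimal IR sketch (the
function name and shuffle mask here are chosen for illustration only):

  ; Illustrative sketch, not taken verbatim from the patch's tests.
  define <16 x i8> @example(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
    ; two tbl2 lookups, each leaving its useful bytes in the low lanes
    %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b,
              <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28,
                         i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
    %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d,
              <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28,
                         i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
    ; constant shuffle combining the low 8 lanes of both results
    %s = shufflevector <16 x i8> %t1, <16 x i8> %t2,
              <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                          i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
    ret <16 x i8> %s
  }
  declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)

With this change such a sequence lowers to a single
"tbl.16b v0, { v0, v1, v2, v3 }, v4" with a combined constant mask,
instead of two tbl.16b plus a mov.d to merge the halves.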
Reviewed By: t.p.northover
Differential Revision: https://reviews.llvm.org/D133491
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/arm64-tbl.ll
llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e268eabef6885..a63d0382dd903 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10795,6 +10795,51 @@ static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
   return SDValue();
 }
 
+// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
+static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
+                                               ArrayRef<int> ShuffleMask,
+                                               SelectionDAG &DAG) {
+  SDValue Tbl1 = Op->getOperand(0);
+  SDValue Tbl2 = Op->getOperand(1);
+  SDLoc dl(Op);
+  SDValue Tbl2ID =
+      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
+
+  EVT VT = Op.getValueType();
+  if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
+      Tbl1->getOperand(0) != Tbl2ID ||
+      Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
+      Tbl2->getOperand(0) != Tbl2ID)
+    return SDValue();
+
+  if (Tbl1->getValueType(0) != MVT::v16i8 ||
+      Tbl2->getValueType(0) != MVT::v16i8)
+    return SDValue();
+
+  SDValue Mask1 = Tbl1->getOperand(3);
+  SDValue Mask2 = Tbl2->getOperand(3);
+  SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
+  for (unsigned I = 0; I < 16; I++) {
+    if (ShuffleMask[I] < 16)
+      TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
+    else {
+      auto *C =
+          dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
+      if (!C)
+        return SDValue();
+      TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
+    }
+  }
+
+  SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
+  SDValue ID =
+      DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
+
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
+                     {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
+                      Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
+}
+
 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -10818,6 +10863,9 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   assert(ShuffleMask.size() == VT.getVectorNumElements() &&
          "Unexpected VECTOR_SHUFFLE mask size!");
 
+  if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
+    return Res;
+
   if (SVN->isSplat()) {
     int Lane = SVN->getSplatIndex();
     // If this is undef splat, generate it via "just" vdup, if possible.
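
For reference, the combined mask is built lane by lane: a shuffle index
below 16 reuses the first tbl2's mask entry unchanged, while an index of
16 or more takes the second tbl2's entry plus 32, because the second
pair of table registers starts at byte 32 of the concatenated 64-byte
tbl4 table. With both tbl2 masks <0, 4, 8, 12, 16, 20, 24, 28, -1, ...>
and a shuffle keeping the low eight lanes of each result, the tbl4 mask
becomes <0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60>,
which is the updated .LCPI9_0 constant checked in arm64-tbl.ll below.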
diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
index 5873a83d8822b..300bcbc503d01 100644
--- a/llvm/test/CodeGen/AArch64/arm64-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
@@ -130,27 +130,25 @@ define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8
; CHECK-NEXT: .byte 20 // 0x14
; CHECK-NEXT: .byte 24 // 0x18
; CHECK-NEXT: .byte 28 // 0x1c
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
-; CHECK-NEXT: .byte 255 // 0xff
+; CHECK-NEXT: .byte 32 // 0x20
+; CHECK-NEXT: .byte 36 // 0x24
+; CHECK-NEXT: .byte 40 // 0x28
+; CHECK-NEXT: .byte 44 // 0x2c
+; CHECK-NEXT: .byte 48 // 0x30
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 56 // 0x38
+; CHECK-NEXT: .byte 60 // 0x3c
define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; CHECK-LABEL: shuffled_tbl2_to_tbl4:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI9_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
+; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI9_0]
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT: tbl.16b v1, { v2, v3 }, v4
-; CHECK-NEXT: mov.d v0[1], v1[0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4
; CHECK-NEXT: ret
%t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -161,24 +159,35 @@ define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c
define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi.2d v4, #0xffffffffffffffff
-; CHECK-NEXT: adrp x8, .LCPI10_0
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI10_0]
-; CHECK-NEXT: mov.b v4[0], w0
-; CHECK-NEXT: tbl.16b v2, { v2, v3 }, v5
+; CHECK-NEXT: fmov s4, w0
+; CHECK-NEXT: mov w8, #32
+; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: mov.b v4[1], w0
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT: mov.b v4[2], w0
; CHECK-NEXT: mov.b v4[3], w0
; CHECK-NEXT: mov.b v4[4], w0
; CHECK-NEXT: mov.b v4[5], w0
; CHECK-NEXT: mov.b v4[6], w0
; CHECK-NEXT: mov.b v4[7], w0
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT: mov.d v0[1], v2[0]
+; CHECK-NEXT: mov.b v4[8], w8
+; CHECK-NEXT: mov w8, #36
+; CHECK-NEXT: mov.b v4[9], w8
+; CHECK-NEXT: mov w8, #40
+; CHECK-NEXT: mov.b v4[10], w8
+; CHECK-NEXT: mov w8, #44
+; CHECK-NEXT: mov.b v4[11], w8
+; CHECK-NEXT: mov w8, #48
+; CHECK-NEXT: mov.b v4[12], w8
+; CHECK-NEXT: mov w8, #52
+; CHECK-NEXT: mov.b v4[13], w8
+; CHECK-NEXT: mov w8, #56
+; CHECK-NEXT: mov.b v4[14], w8
+; CHECK-NEXT: mov w8, #60
+; CHECK-NEXT: mov.b v4[15], w8
+; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4
; CHECK-NEXT: ret
%ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
%ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
@@ -202,16 +211,72 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x
ret <16 x i8> %s
}
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_first_mask2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #1
+; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: mov.b v4[1], w8
+; CHECK-NEXT: mov.b v4[2], w8
+; CHECK-NEXT: mov.b v4[3], w8
+; CHECK-NEXT: mov.b v4[4], w8
+; CHECK-NEXT: mov.b v4[5], w8
+; CHECK-NEXT: mov.b v4[6], w8
+; CHECK-NEXT: mov w8, #32
+; CHECK-NEXT: mov.b v4[7], w0
+; CHECK-NEXT: mov.b v4[8], w8
+; CHECK-NEXT: mov w8, #36
+; CHECK-NEXT: mov.b v4[9], w8
+; CHECK-NEXT: mov w8, #40
+; CHECK-NEXT: mov.b v4[10], w8
+; CHECK-NEXT: mov w8, #44
+; CHECK-NEXT: mov.b v4[11], w8
+; CHECK-NEXT: mov w8, #48
+; CHECK-NEXT: mov.b v4[12], w8
+; CHECK-NEXT: mov w8, #52
+; CHECK-NEXT: mov.b v4[13], w8
+; CHECK-NEXT: mov w8, #56
+; CHECK-NEXT: mov.b v4[14], w8
+; CHECK-NEXT: mov w8, #31
+; CHECK-NEXT: mov.b v4[15], w8
+; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4
+; CHECK-NEXT: ret
+ %ins.0 = insertelement <16 x i8> poison, i8 1, i32 0
+ %ins.1 = insertelement <16 x i8> %ins.0, i8 1, i32 1
+ %ins.2 = insertelement <16 x i8> %ins.1, i8 1, i32 2
+ %ins.3 = insertelement <16 x i8> %ins.2, i8 1, i32 3
+ %ins.4 = insertelement <16 x i8> %ins.3, i8 1, i32 4
+ %ins.5 = insertelement <16 x i8> %ins.4, i8 1, i32 5
+ %ins.6 = insertelement <16 x i8> %ins.5, i8 1, i32 6
+ %ins.7 = insertelement <16 x i8> %ins.6, i8 1, i32 7
+ %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8
+ %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9
+ %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10
+ %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11
+ %ins.12 = insertelement <16 x i8> %ins.11, i8 %v, i32 12
+ %ins.13 = insertelement <16 x i8> %ins.12, i8 %v, i32 13
+ %ins.14 = insertelement <16 x i8> %ins.13, i8 -1, i32 14
+ %ins.15 = insertelement <16 x i8> %ins.14, i8 %v, i32 15
+ %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15)
+ %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 31>
+ ret <16 x i8> %s
+}
+
define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask:
; CHECK: // %bb.0:
; CHECK-NEXT: movi.2d v4, #0xffffffffffffffff
-; CHECK-NEXT: adrp x8, .LCPI11_0
+; CHECK-NEXT: adrp x8, .LCPI12_0
; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI11_0]
+; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI12_0]
; CHECK-NEXT: mov.b v4[0], w0
; CHECK-NEXT: tbl.16b v2, { v2, v3 }, v5
; CHECK-NEXT: mov.b v4[1], w0
@@ -247,20 +312,80 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x
ret <16 x i8> %s
}
-define <16 x i8> @shuffled_tbl2_to_tbl4_incompatible_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_incompatible_shuffle:
+define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) {
+; CHECK-LABEL: shuffled_tbl2_to_tbl4_nonconst_second_mask2:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI12_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: mov w8, #255
+; CHECK-NEXT: dup.16b v4, w0
+; CHECK-NEXT: adrp x9, .LCPI13_0
; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_0]
-; CHECK-NEXT: adrp x8, .LCPI12_1
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT: tbl.16b v1, { v2, v3 }, v4
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1]
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v2
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT: mov.b v4[8], w8
+; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI13_0]
+; CHECK-NEXT: mov.b v4[9], w8
+; CHECK-NEXT: tbl.16b v2, { v2, v3 }, v5
+; CHECK-NEXT: mov.b v4[10], w8
+; CHECK-NEXT: mov.b v4[11], w8
+; CHECK-NEXT: mov.b v4[12], w8
+; CHECK-NEXT: mov.b v4[13], w8
+; CHECK-NEXT: adrp x8, .LCPI13_1
+; CHECK-NEXT: tbl.16b v3, { v0, v1 }, v4
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI13_1]
+; CHECK-NEXT: tbl.16b v0, { v2, v3 }, v0
+; CHECK-NEXT: ret
+ %ins.0 = insertelement <16 x i8> poison, i8 %v, i32 0
+ %ins.1 = insertelement <16 x i8> %ins.0, i8 %v, i32 1
+ %ins.2 = insertelement <16 x i8> %ins.1, i8 %v, i32 2
+ %ins.3 = insertelement <16 x i8> %ins.2, i8 %v, i32 3
+ %ins.4 = insertelement <16 x i8> %ins.3, i8 %v, i32 4
+ %ins.5 = insertelement <16 x i8> %ins.4, i8 %v, i32 5
+ %ins.6 = insertelement <16 x i8> %ins.5, i8 %v, i32 6
+ %ins.7 = insertelement <16 x i8> %ins.6, i8 %v, i32 7
+ %ins.8 = insertelement <16 x i8> %ins.7, i8 -1, i32 8
+ %ins.9 = insertelement <16 x i8> %ins.8, i8 -1, i32 9
+ %ins.10 = insertelement <16 x i8> %ins.9, i8 -1, i32 10
+ %ins.11 = insertelement <16 x i8> %ins.10, i8 -1, i32 11
+ %ins.12 = insertelement <16 x i8> %ins.11, i8 -1, i32 12
+ %ins.13 = insertelement <16 x i8> %ins.12, i8 -1, i32 13
+ %ins.14 = insertelement <16 x i8> %ins.13, i8 %v, i32 14
+ %ins.15 = insertelement <16 x i8> %ins.14, i8 %v, i32 15
+ %t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %ins.15)
+ %s = shufflevector <16 x i8> %t1, <16 x i8> %t2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 30, i32 31>
+ ret <16 x i8> %s
+}
+
+
+; CHECK-LABEL: .LCPI14_0:
+; CHECK-NEXT: .byte 0 // 0x0
+; CHECK-NEXT: .byte 4 // 0x4
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 12 // 0xc
+; CHECK-NEXT: .byte 16 // 0x10
+; CHECK-NEXT: .byte 20 // 0x14
+; CHECK-NEXT: .byte 24 // 0x18
+; CHECK-NEXT: .byte 28 // 0x1c
+; CHECK-NEXT: .byte 32 // 0x20
+; CHECK-NEXT: .byte 36 // 0x24
+; CHECK-NEXT: .byte 40 // 0x28
+; CHECK-NEXT: .byte 44 // 0x2c
+; CHECK-NEXT: .byte 48 // 0x30
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 56 // 0x38
+; CHECK-NEXT: .byte 60 // 0x3c
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_shuffle:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI14_0
+; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI14_0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4
; CHECK-NEXT: ret
%t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -268,22 +393,34 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_incompatible_shuffle(<16 x i8> %a, <16 x
ret <16 x i8> %s
}
-define <16 x i8> @shuffled_tbl2_to_tbl4_incompatible_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_incompatible_tbl2_mask1:
+; CHECK-LABEL: .LCPI15_0:
+; CHECK-NEXT: .byte 0 // 0x0
+; CHECK-NEXT: .byte 4 // 0x4
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 12 // 0xc
+; CHECK-NEXT: .byte 16 // 0x10
+; CHECK-NEXT: .byte 20 // 0x14
+; CHECK-NEXT: .byte 24 // 0x18
+; CHECK-NEXT: .byte 28 // 0x1c
+; CHECK-NEXT: .byte 32 // 0x20
+; CHECK-NEXT: .byte 36 // 0x24
+; CHECK-NEXT: .byte 40 // 0x28
+; CHECK-NEXT: .byte 44 // 0x2c
+; CHECK-NEXT: .byte 48 // 0x30
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 56 // 0x38
+; CHECK-NEXT: .byte 60 // 0x3c
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask1:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI13_0
-; CHECK-NEXT: adrp x9, .LCPI13_1
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI13_0]
-; CHECK-NEXT: adrp x8, .LCPI13_2
-; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI13_1]
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT: tbl.16b v1, { v2, v3 }, v5
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_2]
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v2
+; CHECK-NEXT: adrp x8, .LCPI15_0
+; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI15_0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4
; CHECK-NEXT: ret
%t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -291,22 +428,34 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_incompatible_tbl2_mask1(<16 x i8> %a, <1
ret <16 x i8> %s
}
-define <16 x i8> @shuffled_tbl2_to_tbl4_incompatible_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
-; CHECK-LABEL: shuffled_tbl2_to_tbl4_incompatible_tbl2_mask2:
+; CHECK-LABEL: .LCPI16_0:
+; CHECK-NEXT: .byte 0 // 0x0
+; CHECK-NEXT: .byte 4 // 0x4
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 12 // 0xc
+; CHECK-NEXT: .byte 16 // 0x10
+; CHECK-NEXT: .byte 20 // 0x14
+; CHECK-NEXT: .byte 24 // 0x18
+; CHECK-NEXT: .byte 28 // 0x1c
+; CHECK-NEXT: .byte 32 // 0x20
+; CHECK-NEXT: .byte 36 // 0x24
+; CHECK-NEXT: .byte 40 // 0x28
+; CHECK-NEXT: .byte 44 // 0x2c
+; CHECK-NEXT: .byte 48 // 0x30
+; CHECK-NEXT: .byte 52 // 0x34
+; CHECK-NEXT: .byte 56 // 0x38
+; CHECK-NEXT: .byte 60 // 0x3c
+
+define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: shuffled_tbl2_to_tbl4_mixed_tbl2_mask2:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI14_0
-; CHECK-NEXT: adrp x9, .LCPI14_1
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI14_0]
-; CHECK-NEXT: adrp x8, .LCPI14_2
-; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI14_1]
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v4
-; CHECK-NEXT: tbl.16b v1, { v2, v3 }, v5
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_2]
-; CHECK-NEXT: tbl.16b v0, { v0, v1 }, v2
+; CHECK-NEXT: adrp x8, .LCPI16_0
+; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT: tbl.16b v0, { v0, v1, v2, v3 }, v4
; CHECK-NEXT: ret
%t1 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
%t2 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
index 78f60d635ee58..2cdbc109e3bf1 100644
--- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -94,14 +94,14 @@ entry:
; CHECK-NEXT: .byte 20 ; 0x14
; CHECK-NEXT: .byte 24 ; 0x18
; CHECK-NEXT: .byte 28 ; 0x1c
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 32 ; 0x20
+; CHECK-NEXT: .byte 36 ; 0x24
+; CHECK-NEXT: .byte 40 ; 0x28
+; CHECK-NEXT: .byte 44 ; 0x2c
+; CHECK-NEXT: .byte 48 ; 0x30
+; CHECK-NEXT: .byte 52 ; 0x34
+; CHECK-NEXT: .byte 56 ; 0x38
+; CHECK-NEXT: .byte 60 ; 0x3c
; Tbl can also be used when combining multiple fptoui using a shuffle. The loop
; vectorizer may create such patterns.
@@ -118,16 +118,14 @@ define void @fptoui_2x_v8f32_to_v8i8_in_loop(ptr %A, ptr %B, ptr %dst) {
; CHECK-NEXT: lsl x9, x8, #5
; CHECK-NEXT: add x10, x0, x9
; CHECK-NEXT: add x9, x1, x9
-; CHECK-NEXT: ldp q1, q2, [x10]
+; CHECK-NEXT: ldp q2, q1, [x10]
; CHECK-NEXT: ldp q4, q3, [x9]
-; CHECK-NEXT: fcvtzu.4s v6, v2
-; CHECK-NEXT: fcvtzu.4s v5, v1
-; CHECK-NEXT: fcvtzu.4s v2, v3
-; CHECK-NEXT: fcvtzu.4s v1, v4
-; CHECK-NEXT: tbl.16b v3, { v5, v6 }, v0
-; CHECK-NEXT: tbl.16b v1, { v1, v2 }, v0
-; CHECK-NEXT: mov.d v3[1], v1[0]
-; CHECK-NEXT: str q3, [x2, x8, lsl #4]
+; CHECK-NEXT: fcvtzu.4s v17, v1
+; CHECK-NEXT: fcvtzu.4s v16, v2
+; CHECK-NEXT: fcvtzu.4s v19, v3
+; CHECK-NEXT: fcvtzu.4s v18, v4
+; CHECK-NEXT: tbl.16b v1, { v16, v17, v18, v19 }, v0
+; CHECK-NEXT: str q1, [x2, x8, lsl #4]
; CHECK-NEXT: add x8, x8, #1
; CHECK-NEXT: cmp x8, #1000
; CHECK-NEXT: b.eq LBB2_1
@@ -157,75 +155,50 @@ exit:
}
; CHECK-LABEL: lCPI3_0:
-; CHECK-NEXT: .byte 0 ; 0x0
-; CHECK-NEXT: .byte 4 ; 0x4
-; CHECK-NEXT: .byte 8 ; 0x8
-; CHECK-NEXT: .byte 12 ; 0xc
-; CHECK-NEXT: .byte 16 ; 0x10
-; CHECK-NEXT: .byte 20 ; 0x14
-; CHECK-NEXT: .byte 24 ; 0x18
-; CHECK-NEXT: .byte 28 ; 0x1c
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: .byte 255 ; 0xff
-; CHECK-NEXT: lCPI3_1:
-; CHECK-NEXT: .byte 0 ; 0x0
-; CHECK-NEXT: .byte 17 ; 0x11
-; CHECK-NEXT: .byte 2 ; 0x2
-; CHECK-NEXT: .byte 3 ; 0x3
-; CHECK-NEXT: .byte 4 ; 0x4
-; CHECK-NEXT: .byte 5 ; 0x5
-; CHECK-NEXT: .byte 6 ; 0x6
-; CHECK-NEXT: .byte 19 ; 0x13
-; CHECK-NEXT: .byte 16 ; 0x10
-; CHECK-NEXT: .byte 17 ; 0x11
-; CHECK-NEXT: .byte 18 ; 0x12
-; CHECK-NEXT: .byte 19 ; 0x13
-; CHECK-NEXT: .byte 20 ; 0x14
-; CHECK-NEXT: .byte 3 ; 0x3
-; CHECK-NEXT: .byte 22 ; 0x16
-; CHECK-NEXT: .byte 23 ; 0x17
+; CHECK-NEXT: .byte 0 ; 0x0
+; CHECK-NEXT: .byte 36 ; 0x24
+; CHECK-NEXT: .byte 8 ; 0x8
+; CHECK-NEXT: .byte 12 ; 0xc
+; CHECK-NEXT: .byte 16 ; 0x10
+; CHECK-NEXT: .byte 20 ; 0x14
+; CHECK-NEXT: .byte 24 ; 0x18
+; CHECK-NEXT: .byte 44 ; 0x2c
+; CHECK-NEXT: .byte 32 ; 0x20
+; CHECK-NEXT: .byte 36 ; 0x24
+; CHECK-NEXT: .byte 40 ; 0x28
+; CHECK-NEXT: .byte 44 ; 0x2c
+; CHECK-NEXT: .byte 48 ; 0x30
+; CHECK-NEXT: .byte 12 ; 0xc
+; CHECK-NEXT: .byte 56 ; 0x38
+; CHECK-NEXT: .byte 60 ; 0x3c
-; We need multiple tbl for the shuffle.
define void @fptoui_2x_v8f32_to_v8i8_in_loop_no_concat_shuffle(ptr %A, ptr %B, ptr %dst) {
; CHECK-LABEL: fptoui_2x_v8f32_to_v8i8_in_loop_no_concat_shuffle:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: Lloh4:
; CHECK-NEXT: adrp x9, lCPI3_0@PAGE
-; CHECK-NEXT: Lloh5:
-; CHECK-NEXT: adrp x10, lCPI3_1@PAGE
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh6:
+; CHECK-NEXT: Lloh5:
; CHECK-NEXT: ldr q0, [x9, lCPI3_0@PAGEOFF]
-; CHECK-NEXT: Lloh7:
-; CHECK-NEXT: ldr q1, [x10, lCPI3_1@PAGEOFF]
; CHECK-NEXT: LBB3_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lsl x9, x8, #5
; CHECK-NEXT: add x10, x0, x9
; CHECK-NEXT: add x9, x1, x9
-; CHECK-NEXT: ldp q2, q3, [x10]
-; CHECK-NEXT: ldp q5, q4, [x9]
-; CHECK-NEXT: fcvtzu.4s v7, v3
-; CHECK-NEXT: fcvtzu.4s v6, v2
-; CHECK-NEXT: fcvtzu.4s v3, v4
-; CHECK-NEXT: fcvtzu.4s v2, v5
-; CHECK-NEXT: tbl.16b v4, { v6, v7 }, v0
-; CHECK-NEXT: tbl.16b v5, { v2, v3 }, v0
-; CHECK-NEXT: tbl.16b v2, { v4, v5 }, v1
-; CHECK-NEXT: str q2, [x2, x8, lsl #4]
+; CHECK-NEXT: ldp q2, q1, [x10]
+; CHECK-NEXT: ldp q4, q3, [x9]
+; CHECK-NEXT: fcvtzu.4s v17, v1
+; CHECK-NEXT: fcvtzu.4s v16, v2
+; CHECK-NEXT: fcvtzu.4s v19, v3
+; CHECK-NEXT: fcvtzu.4s v18, v4
+; CHECK-NEXT: tbl.16b v1, { v16, v17, v18, v19 }, v0
+; CHECK-NEXT: str q1, [x2, x8, lsl #4]
; CHECK-NEXT: add x8, x8, #1
; CHECK-NEXT: cmp x8, #1000
; CHECK-NEXT: b.eq LBB3_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh5, Lloh7
-; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh6
+; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5
entry:
br label %loop
@@ -269,10 +242,10 @@ exit:
define void @fptoui_v16f32_to_v16i8_in_loop(ptr %A, ptr %dst) {
; CHECK-LABEL: fptoui_v16f32_to_v16i8_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh8:
+; CHECK-NEXT: Lloh6:
; CHECK-NEXT: adrp x9, lCPI4_0@PAGE
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh9:
+; CHECK-NEXT: Lloh7:
; CHECK-NEXT: ldr q0, [x9, lCPI4_0@PAGEOFF]
; CHECK-NEXT: LBB4_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -290,7 +263,7 @@ define void @fptoui_v16f32_to_v16i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: b.eq LBB4_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh9
+; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7
entry:
br label %loop
@@ -330,10 +303,10 @@ exit:
define void @fptoui_2x_v16f32_to_v16i8_in_loop(ptr %A, ptr %B, ptr %dst) {
; CHECK-LABEL: fptoui_2x_v16f32_to_v16i8_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh10:
+; CHECK-NEXT: Lloh8:
; CHECK-NEXT: adrp x9, lCPI5_0@PAGE
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh11:
+; CHECK-NEXT: Lloh9:
; CHECK-NEXT: ldr q0, [x9, lCPI5_0@PAGEOFF]
; CHECK-NEXT: LBB5_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -361,7 +334,7 @@ define void @fptoui_2x_v16f32_to_v16i8_in_loop(ptr %A, ptr %B, ptr %dst) {
; CHECK-NEXT: b.eq LBB5_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh11
+; CHECK-NEXT: .loh AdrpLdr Lloh8, Lloh9
entry:
br label %loop
@@ -503,14 +476,14 @@ exit:
define void @uitofp_v8i8_to_v8f32(ptr %src, ptr %dst) {
; CHECK-LABEL: uitofp_v8i8_to_v8f32:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh12:
+; CHECK-NEXT: Lloh10:
; CHECK-NEXT: adrp x9, lCPI8_0@PAGE
-; CHECK-NEXT: Lloh13:
+; CHECK-NEXT: Lloh11:
; CHECK-NEXT: adrp x10, lCPI8_1@PAGE
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh14:
+; CHECK-NEXT: Lloh12:
; CHECK-NEXT: ldr q0, [x9, lCPI8_0@PAGEOFF]
-; CHECK-NEXT: Lloh15:
+; CHECK-NEXT: Lloh13:
; CHECK-NEXT: ldr q1, [x10, lCPI8_1@PAGEOFF]
; CHECK-NEXT: LBB8_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -526,8 +499,8 @@ define void @uitofp_v8i8_to_v8f32(ptr %src, ptr %dst) {
; CHECK-NEXT: b.eq LBB8_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh13, Lloh15
-; CHECK-NEXT: .loh AdrpLdr Lloh12, Lloh14
+; CHECK-NEXT: .loh AdrpLdr Lloh11, Lloh13
+; CHECK-NEXT: .loh AdrpLdr Lloh10, Lloh12
entry:
br label %loop
@@ -618,22 +591,22 @@ exit:
define void @uitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
; CHECK-LABEL: uitofp_v16i8_to_v16f32:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: Lloh16:
+; CHECK-NEXT: Lloh14:
; CHECK-NEXT: adrp x9, lCPI9_0@PAGE
-; CHECK-NEXT: Lloh17:
+; CHECK-NEXT: Lloh15:
; CHECK-NEXT: adrp x10, lCPI9_1@PAGE
-; CHECK-NEXT: Lloh18:
+; CHECK-NEXT: Lloh16:
; CHECK-NEXT: adrp x11, lCPI9_2@PAGE
-; CHECK-NEXT: Lloh19:
+; CHECK-NEXT: Lloh17:
; CHECK-NEXT: adrp x12, lCPI9_3@PAGE
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: Lloh20:
+; CHECK-NEXT: Lloh18:
; CHECK-NEXT: ldr q0, [x9, lCPI9_0@PAGEOFF]
-; CHECK-NEXT: Lloh21:
+; CHECK-NEXT: Lloh19:
; CHECK-NEXT: ldr q1, [x10, lCPI9_1@PAGEOFF]
-; CHECK-NEXT: Lloh22:
+; CHECK-NEXT: Lloh20:
; CHECK-NEXT: ldr q2, [x11, lCPI9_2@PAGEOFF]
-; CHECK-NEXT: Lloh23:
+; CHECK-NEXT: Lloh21:
; CHECK-NEXT: ldr q3, [x12, lCPI9_3@PAGEOFF]
; CHECK-NEXT: LBB9_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -654,10 +627,10 @@ define void @uitofp_v16i8_to_v16f32(ptr %src, ptr %dst) {
; CHECK-NEXT: b.eq LBB9_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
-; CHECK-NEXT: .loh AdrpLdr Lloh19, Lloh23
-; CHECK-NEXT: .loh AdrpLdr Lloh18, Lloh22
; CHECK-NEXT: .loh AdrpLdr Lloh17, Lloh21
; CHECK-NEXT: .loh AdrpLdr Lloh16, Lloh20
+; CHECK-NEXT: .loh AdrpLdr Lloh15, Lloh19
+; CHECK-NEXT: .loh AdrpLdr Lloh14, Lloh18
entry:
br label %loop