[llvm] 02d09ff - [AArch64] Extending lowering of 'trunc <(8|16) x i64> %x to <(8|16) x i8>' to use tbl instructions
Nilanjana Basu via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 15 07:21:22 PST 2022
Author: Nilanjana Basu
Date: 2022-12-15T20:50:40+05:30
New Revision: 02d09ffc1b09fe025272231a6ebebcc7f883e8e6
URL: https://github.com/llvm/llvm-project/commit/02d09ffc1b09fe025272231a6ebebcc7f883e8e6
DIFF: https://github.com/llvm/llvm-project/commit/02d09ffc1b09fe025272231a6ebebcc7f883e8e6.diff
LOG: [AArch64] Extending lowering of 'trunc <(8|16) x i64> %x to <(8|16) x i8>' to use tbl instructions
[AArch64] Patch for lowering trunc instructions to 'tbl' for (8|16)xi32 -> (8|16)xi8 conversions in https://reviews.llvm.org/D133495 is extended to support trunc to tbl lowering for (8|16) x i64 to (8|16) x i8.
A microbenchmark for runtime for these transformations is added in https://reviews.llvm.org/D136274
Reviewed by: fhahn, t.p.northover
Differential Revision: https://reviews.llvm.org/D135229
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3b5203f5e9c5..04c9ff82fa4b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13920,38 +13920,116 @@ static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
IRBuilder<> Builder(TI);
SmallVector<Value *> Parts;
+ int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
+ auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
+ auto *DstTy = cast<FixedVectorType>(TI->getType());
+ assert(SrcTy->getElementType()->isIntegerTy() &&
+ "Non-integer type source vector element is not supported");
+ assert(DstTy->getElementType()->isIntegerTy(8) &&
+ "Unsupported destination vector element type");
+ unsigned SrcElemTySz =
+ cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
+ unsigned TruncFactor =
+ SrcElemTySz / cast<IntegerType>(DstTy->getElementType())->getBitWidth();
+ assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
+ "Unsupported source vector element type size");
Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
- Parts.push_back(Builder.CreateBitCast(
- Builder.CreateShuffleVector(TI->getOperand(0), {0, 1, 2, 3}), VecTy));
- Parts.push_back(Builder.CreateBitCast(
- Builder.CreateShuffleVector(TI->getOperand(0), {4, 5, 6, 7}), VecTy));
-
- Intrinsic::ID TblID = Intrinsic::aarch64_neon_tbl2;
- unsigned NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
- if (NumElements == 16) {
- Parts.push_back(Builder.CreateBitCast(
- Builder.CreateShuffleVector(TI->getOperand(0), {8, 9, 10, 11}), VecTy));
- Parts.push_back(Builder.CreateBitCast(
- Builder.CreateShuffleVector(TI->getOperand(0), {12, 13, 14, 15}),
- VecTy));
- TblID = Intrinsic::aarch64_neon_tbl4;
- }
+
+ // Create a mask to choose every nth byte from the source vector table of
+ // bytes to create the truncated destination vector, where 'n' is the truncate
+ // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
+ // 0,8,16,..Y*8th bytes for the little-endian format
SmallVector<Constant *, 16> MaskConst;
- for (unsigned Idx = 0; Idx < NumElements * 4; Idx += 4)
- MaskConst.push_back(
- ConstantInt::get(Builder.getInt8Ty(), IsLittleEndian ? Idx : Idx + 3));
+ for (int Itr = 0; Itr < 16; Itr++) {
+ if (Itr < NumElements)
+ MaskConst.push_back(ConstantInt::get(
+ Builder.getInt8Ty(), IsLittleEndian
+ ? Itr * TruncFactor
+ : Itr * TruncFactor + (TruncFactor - 1)));
+ else
+ MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255));
+ }
+
+ int MaxTblSz = 128 * 4;
+ int MaxSrcSz = SrcElemTySz * NumElements;
+ int ElemsPerTbl =
+ (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
+ assert(ElemsPerTbl <= 16 &&
+ "Maximum elements selected using TBL instruction cannot exceed 16!");
+
+ int ShuffleCount = 128 / SrcElemTySz;
+ SmallVector<int> ShuffleLanes;
+ for (int i = 0; i < ShuffleCount; ++i)
+ ShuffleLanes.push_back(i);
+
+ // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
+ // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
+ // call TBL & save the result in a vector of TBL results for combining later.
+ SmallVector<Value *> Results;
+ while (ShuffleLanes.back() < NumElements) {
+ Parts.push_back(Builder.CreateBitCast(
+ Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
+
+ if (Parts.size() >= 4) {
+ auto *F = Intrinsic::getDeclaration(TI->getModule(),
+ Intrinsic::aarch64_neon_tbl4, VecTy);
+ Parts.push_back(ConstantVector::get(MaskConst));
+ Results.push_back(Builder.CreateCall(F, Parts));
+ Parts.clear();
+ }
+
+ for (int i = 0; i < ShuffleCount; ++i)
+ ShuffleLanes[i] += ShuffleCount;
+ }
+
+ assert((Parts.empty() || Results.empty()) &&
+ "Lowering trunc for vectors requiring different TBL instructions is "
+ "not supported!");
+ // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
+ // registers
+ if (!Parts.empty()) {
+ Intrinsic::ID TblID;
+ switch (Parts.size()) {
+ case 1:
+ TblID = Intrinsic::aarch64_neon_tbl1;
+ break;
+ case 2:
+ TblID = Intrinsic::aarch64_neon_tbl2;
+ break;
+ case 3:
+ TblID = Intrinsic::aarch64_neon_tbl3;
+ break;
+ }
- for (unsigned Idx = NumElements * 4; Idx < 64; Idx += 4)
- MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255));
+ auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
+ Parts.push_back(ConstantVector::get(MaskConst));
+ Results.push_back(Builder.CreateCall(F, Parts));
+ }
- Parts.push_back(ConstantVector::get(MaskConst));
- auto *F =
- Intrinsic::getDeclaration(TI->getModule(), TblID, Parts[0]->getType());
- Value *Res = Builder.CreateCall(F, Parts);
+ // Extract the destination vector from TBL result(s) after combining them
+ // where applicable. Currently, at most two TBLs are supported.
+ assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
+ "more than 2 tbl instructions!");
+ Value *FinalResult = Results[0];
+ if (Results.size() == 1) {
+ if (ElemsPerTbl < 16) {
+ SmallVector<int> FinalMask(ElemsPerTbl);
+ std::iota(FinalMask.begin(), FinalMask.end(), 0);
+ FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
+ }
+ } else {
+ SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
+ if (ElemsPerTbl < 16) {
+ std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
+ std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
+ } else {
+ std::iota(FinalMask.begin(), FinalMask.end(), 0);
+ }
+ FinalResult =
+ Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
+ }
- if (NumElements == 8)
- Res = Builder.CreateShuffleVector(Res, {0, 1, 2, 3, 4, 5, 6, 7});
- TI->replaceAllUsesWith(Res);
+ TI->replaceAllUsesWith(FinalResult);
TI->eraseFromParent();
}
@@ -14013,13 +14091,15 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
return true;
}
- // Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>' to a single tbl.4
- // instruction selecting the lowest 8 bits per lane of the input interpreted
- // as 2 or 4 <4 x i32> vectors.
+ // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
+ // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
+ // per lane of the input that is represented using 1,2,3 or 4 128-bit table
+ // registers
auto *TI = dyn_cast<TruncInst>(I);
- if (TI && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
- SrcTy->getElementType()->isIntegerTy(32) &&
- DstTy->getElementType()->isIntegerTy(8)) {
+ if (TI && DstTy->getElementType()->isIntegerTy(8) &&
+ ((SrcTy->getElementType()->isIntegerTy(32) ||
+ SrcTy->getElementType()->isIntegerTy(64)) &&
+ (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
createTblForTrunc(TI, Subtarget->isLittleEndian());
return true;
}
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index 952e2320d154..f6f0b1d41788 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -235,66 +235,100 @@ exit:
ret void
}
+; CHECK-LABEL: lCPI3_0:
+; CHECK-NEXT: .byte 0 ; 0x0
+; CHECK-NEXT: .byte 8 ; 0x8
+; CHECK-NEXT: .byte 16 ; 0x10
+; CHECK-NEXT: .byte 24 ; 0x18
+; CHECK-NEXT: .byte 32 ; 0x20
+; CHECK-NEXT: .byte 40 ; 0x28
+; CHECK-NEXT: .byte 48 ; 0x30
+; CHECK-NEXT: .byte 56 ; 0x38
+; CHECK-NEXT: .byte 64 ; 0x40
+; CHECK-NEXT: .byte 72 ; 0x48
+; CHECK-NEXT: .byte 80 ; 0x50
+; CHECK-NEXT: .byte 88 ; 0x58
+; CHECK-NEXT: .byte 96 ; 0x60
+; CHECK-NEXT: .byte 104 ; 0x68
+; CHECK-NEXT: .byte 112 ; 0x70
+; CHECK-NEXT: .byte 120 ; 0x78
+
+; CHECK-BE-LABEL: .LCPI3_0:
+; CHECK-BE-NEXT: .byte 7 // 0x7
+; CHECK-BE-NEXT: .byte 15 // 0xf
+; CHECK-BE-NEXT: .byte 23 // 0x17
+; CHECK-BE-NEXT: .byte 31 // 0x1f
+; CHECK-BE-NEXT: .byte 39 // 0x27
+; CHECK-BE-NEXT: .byte 47 // 0x2f
+; CHECK-BE-NEXT: .byte 55 // 0x37
+; CHECK-BE-NEXT: .byte 63 // 0x3f
+; CHECK-BE-NEXT: .byte 71 // 0x47
+; CHECK-BE-NEXT: .byte 79 // 0x4f
+; CHECK-BE-NEXT: .byte 87 // 0x57
+; CHECK-BE-NEXT: .byte 95 // 0x5f
+; CHECK-BE-NEXT: .byte 103 // 0x67
+; CHECK-BE-NEXT: .byte 111 // 0x6f
+; CHECK-BE-NEXT: .byte 119 // 0x77
+; CHECK-BE-NEXT: .byte 127 // 0x7f
define void @trunc_v16i64_to_v16i8_in_loop(ptr %A, ptr %dst) {
-; CHECK-LABEL: trunc_v16i64_to_v16i8_in_loop:
-; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: LBB3_1: ; %loop
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8, lsl #7
-; CHECK-NEXT: ldp q3, q2, [x9, #96]
-; CHECK-NEXT: ldp q1, q0, [x9, #32]
-; CHECK-NEXT: uzp1.4s v2, v3, v2
-; CHECK-NEXT: ldp q5, q4, [x9, #64]
-; CHECK-NEXT: uzp1.4s v0, v1, v0
-; CHECK-NEXT: ldp q3, q6, [x9]
-; CHECK-NEXT: uzp1.4s v4, v5, v4
-; CHECK-NEXT: uzp1.8h v2, v4, v2
-; CHECK-NEXT: uzp1.4s v1, v3, v6
-; CHECK-NEXT: uzp1.8h v0, v1, v0
-; CHECK-NEXT: uzp1.16b v0, v0, v2
-; CHECK-NEXT: str q0, [x1, x8, lsl #4]
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
-; CHECK-NEXT: b.eq LBB3_1
-; CHECK-NEXT: ; %bb.2: ; %exit
-; CHECK-NEXT: ret
-;
-; CHECK-BE-LABEL: trunc_v16i64_to_v16i8_in_loop:
-; CHECK-BE: // %bb.0: // %entry
-; CHECK-BE-NEXT: mov x8, xzr
-; CHECK-BE-NEXT: .LBB3_1: // %loop
-; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: add x9, x0, x8, lsl #7
-; CHECK-BE-NEXT: add x10, x9, #48
-; CHECK-BE-NEXT: add x11, x9, #32
-; CHECK-BE-NEXT: ld1 { v5.2d }, [x9]
-; CHECK-BE-NEXT: ld1 { v0.2d }, [x10]
-; CHECK-BE-NEXT: add x10, x9, #80
-; CHECK-BE-NEXT: ld1 { v1.2d }, [x11]
-; CHECK-BE-NEXT: add x11, x9, #112
-; CHECK-BE-NEXT: ld1 { v2.2d }, [x10]
-; CHECK-BE-NEXT: add x10, x9, #96
-; CHECK-BE-NEXT: ld1 { v3.2d }, [x11]
-; CHECK-BE-NEXT: uzp1 v0.4s, v1.4s, v0.4s
-; CHECK-BE-NEXT: ld1 { v4.2d }, [x10]
-; CHECK-BE-NEXT: add x10, x9, #64
-; CHECK-BE-NEXT: add x9, x9, #16
-; CHECK-BE-NEXT: ld1 { v6.2d }, [x10]
-; CHECK-BE-NEXT: ld1 { v7.2d }, [x9]
-; CHECK-BE-NEXT: add x9, x1, x8, lsl #4
-; CHECK-BE-NEXT: uzp1 v3.4s, v4.4s, v3.4s
-; CHECK-BE-NEXT: add x8, x8, #1
-; CHECK-BE-NEXT: cmp x8, #1000
-; CHECK-BE-NEXT: uzp1 v2.4s, v6.4s, v2.4s
-; CHECK-BE-NEXT: uzp1 v1.4s, v5.4s, v7.4s
-; CHECK-BE-NEXT: uzp1 v2.8h, v2.8h, v3.8h
-; CHECK-BE-NEXT: uzp1 v0.8h, v1.8h, v0.8h
-; CHECK-BE-NEXT: uzp1 v0.16b, v0.16b, v2.16b
-; CHECK-BE-NEXT: st1 { v0.16b }, [x9]
-; CHECK-BE-NEXT: b.eq .LBB3_1
-; CHECK-BE-NEXT: // %bb.2: // %exit
-; CHECK-BE-NEXT: ret
+; CHECK-LABEL: trunc_v16i64_to_v16i8_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: Lloh4:
+; CHECK-NEXT: adrp x9, lCPI3_0@PAGE
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: Lloh5:
+; CHECK-NEXT: ldr q0, [x9, lCPI3_0@PAGEOFF]
+; CHECK-NEXT: LBB3_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8, lsl #7
+; CHECK-NEXT: ldp q1, q2, [x9]
+; CHECK-NEXT: ldp q3, q4, [x9, #32]
+; CHECK-NEXT: ldp q16, q17, [x9, #64]
+; CHECK-NEXT: tbl.16b v1, { v1, v2, v3, v4 }, v0
+; CHECK-NEXT: ldp q18, q19, [x9, #96]
+; CHECK-NEXT: tbl.16b v2, { v16, v17, v18, v19 }, v0
+; CHECK-NEXT: mov.d v1[1], v2[0]
+; CHECK-NEXT: str q1, [x1, x8, lsl #4]
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: b.eq LBB3_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
+; CHECK-BE-LABEL: trunc_v16i64_to_v16i8_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: adrp x8, .LCPI3_0
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI3_0
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x8]
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB3_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8, lsl #7
+; CHECK-BE-NEXT: add x10, x9, #16
+; CHECK-BE-NEXT: add x11, x9, #32
+; CHECK-BE-NEXT: ld1 { v1.16b }, [x9]
+; CHECK-BE-NEXT: ld1 { v2.16b }, [x10]
+; CHECK-BE-NEXT: add x10, x9, #48
+; CHECK-BE-NEXT: ld1 { v3.16b }, [x11]
+; CHECK-BE-NEXT: add x11, x9, #64
+; CHECK-BE-NEXT: ld1 { v4.16b }, [x10]
+; CHECK-BE-NEXT: add x10, x9, #80
+; CHECK-BE-NEXT: ld1 { v16.16b }, [x11]
+; CHECK-BE-NEXT: add x11, x9, #96
+; CHECK-BE-NEXT: add x9, x9, #112
+; CHECK-BE-NEXT: ld1 { v17.16b }, [x10]
+; CHECK-BE-NEXT: tbl v1.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
+; CHECK-BE-NEXT: ld1 { v18.16b }, [x11]
+; CHECK-BE-NEXT: ld1 { v19.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, x8, lsl #4
+; CHECK-BE-NEXT: add x8, x8, #1
+; CHECK-BE-NEXT: cmp x8, #1000
+; CHECK-BE-NEXT: tbl v2.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b
+; CHECK-BE-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-BE-NEXT: st1 { v1.16b }, [x9]
+; CHECK-BE-NEXT: b.eq .LBB3_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
entry:
br label %loop
@@ -314,50 +348,87 @@ exit:
ret void
}
+; CHECK-LABEL: lCPI4_0:
+; CHECK-NEXT: .byte 0 ; 0x0
+; CHECK-NEXT: .byte 8 ; 0x8
+; CHECK-NEXT: .byte 16 ; 0x10
+; CHECK-NEXT: .byte 24 ; 0x18
+; CHECK-NEXT: .byte 32 ; 0x20
+; CHECK-NEXT: .byte 40 ; 0x28
+; CHECK-NEXT: .byte 48 ; 0x30
+; CHECK-NEXT: .byte 56 ; 0x38
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+
+; CHECK-BE-LABEL: .LCPI4_0:
+; CHECK-BE-NEXT: .byte 7 // 0x7
+; CHECK-BE-NEXT: .byte 15 // 0xf
+; CHECK-BE-NEXT: .byte 23 // 0x17
+; CHECK-BE-NEXT: .byte 31 // 0x1f
+; CHECK-BE-NEXT: .byte 39 // 0x27
+; CHECK-BE-NEXT: .byte 47 // 0x2f
+; CHECK-BE-NEXT: .byte 55 // 0x37
+; CHECK-BE-NEXT: .byte 63 // 0x3f
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
define void @trunc_v8i64_to_v8i8_in_loop(ptr %A, ptr %dst) {
-; CHECK-LABEL: trunc_v8i64_to_v8i8_in_loop:
-; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: LBB4_1: ; %loop
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8, lsl #6
-; CHECK-NEXT: ldp q1, q0, [x9, #32]
-; CHECK-NEXT: ldp q3, q2, [x9]
-; CHECK-NEXT: uzp1.4s v0, v1, v0
-; CHECK-NEXT: uzp1.4s v1, v3, v2
-; CHECK-NEXT: uzp1.8h v0, v1, v0
-; CHECK-NEXT: xtn.8b v0, v0
-; CHECK-NEXT: str d0, [x1, x8, lsl #3]
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
-; CHECK-NEXT: b.eq LBB4_1
-; CHECK-NEXT: ; %bb.2: ; %exit
-; CHECK-NEXT: ret
-;
-; CHECK-BE-LABEL: trunc_v8i64_to_v8i8_in_loop:
-; CHECK-BE: // %bb.0: // %entry
-; CHECK-BE-NEXT: mov x8, xzr
-; CHECK-BE-NEXT: .LBB4_1: // %loop
-; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: add x9, x0, x8, lsl #6
-; CHECK-BE-NEXT: add x10, x9, #48
-; CHECK-BE-NEXT: ld1 { v1.2d }, [x9]
-; CHECK-BE-NEXT: ld1 { v0.2d }, [x10]
-; CHECK-BE-NEXT: add x10, x9, #32
-; CHECK-BE-NEXT: add x9, x9, #16
-; CHECK-BE-NEXT: ld1 { v2.2d }, [x10]
-; CHECK-BE-NEXT: ld1 { v3.2d }, [x9]
-; CHECK-BE-NEXT: add x9, x1, x8, lsl #3
-; CHECK-BE-NEXT: add x8, x8, #1
-; CHECK-BE-NEXT: cmp x8, #1000
-; CHECK-BE-NEXT: uzp1 v0.4s, v2.4s, v0.4s
-; CHECK-BE-NEXT: uzp1 v1.4s, v1.4s, v3.4s
-; CHECK-BE-NEXT: uzp1 v0.8h, v1.8h, v0.8h
-; CHECK-BE-NEXT: xtn v0.8b, v0.8h
-; CHECK-BE-NEXT: st1 { v0.8b }, [x9]
-; CHECK-BE-NEXT: b.eq .LBB4_1
-; CHECK-BE-NEXT: // %bb.2: // %exit
-; CHECK-BE-NEXT: ret
+; CHECK-LABEL: trunc_v8i64_to_v8i8_in_loop:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: Lloh6:
+; CHECK-NEXT: adrp x9, lCPI4_0@PAGE
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: Lloh7:
+; CHECK-NEXT: ldr q0, [x9, lCPI4_0@PAGEOFF]
+; CHECK-NEXT: LBB4_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8, lsl #6
+; CHECK-NEXT: ldp q1, q2, [x9]
+; CHECK-NEXT: ldp q3, q4, [x9, #32]
+; CHECK-NEXT: tbl.16b v1, { v1, v2, v3, v4 }, v0
+; CHECK-NEXT: str d1, [x1, x8, lsl #3]
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: b.eq LBB4_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7
+
+; CHECK-BE-LABEL: trunc_v8i64_to_v8i8_in_loop:
+; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: adrp x8, .LCPI4_0
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI4_0
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x8]
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB4_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8, lsl #6
+; CHECK-BE-NEXT: add x10, x9, #16
+; CHECK-BE-NEXT: add x11, x9, #32
+; CHECK-BE-NEXT: ld1 { v1.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x9, #48
+; CHECK-BE-NEXT: ld1 { v2.16b }, [x10]
+; CHECK-BE-NEXT: ld1 { v3.16b }, [x11]
+; CHECK-BE-NEXT: ld1 { v4.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x1, x8, lsl #3
+; CHECK-BE-NEXT: add x8, x8, #1
+; CHECK-BE-NEXT: cmp x8, #1000
+; CHECK-BE-NEXT: tbl v1.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
+; CHECK-BE-NEXT: st1 { v1.8b }, [x9]
+; CHECK-BE-NEXT: b.eq .LBB4_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
entry:
br label %loop
@@ -558,36 +629,38 @@ exit:
define void @trunc_v16i16_to_v16i8_in_loop(ptr %A, ptr %dst) {
; CHECK-LABEL: trunc_v16i16_to_v16i8_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: LBB7_1: ; %loop
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8, lsl #5
-; CHECK-NEXT: ldp q1, q0, [x9]
-; CHECK-NEXT: uzp1.16b v0, v1, v0
-; CHECK-NEXT: str q0, [x1, x8, lsl #4]
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
-; CHECK-NEXT: b.eq LBB7_1
-; CHECK-NEXT: ; %bb.2: ; %exit
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB7_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: add x9, x0, x8, lsl #5
+; CHECK-NEXT: ldp q1, q0, [x9]
+; CHECK-NEXT: uzp1.16b v0, v1, v0
+; CHECK-NEXT: str q0, [x1, x8, lsl #4]
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: b.eq LBB7_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
; CHECK-BE-LABEL: trunc_v16i16_to_v16i8_in_loop:
; CHECK-BE: // %bb.0: // %entry
; CHECK-BE-NEXT: mov x8, xzr
-; CHECK-BE-NEXT: .LBB7_1: // %loop
-; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: add x9, x0, x8, lsl #5
-; CHECK-BE-NEXT: add x10, x9, #16
-; CHECK-BE-NEXT: ld1 { v0.8h }, [x9]
-; CHECK-BE-NEXT: add x9, x1, x8, lsl #4
-; CHECK-BE-NEXT: add x8, x8, #1
-; CHECK-BE-NEXT: ld1 { v1.8h }, [x10]
-; CHECK-BE-NEXT: cmp x8, #1000
-; CHECK-BE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-BE-NEXT: st1 { v0.16b }, [x9]
-; CHECK-BE-NEXT: b.eq .LBB7_1
-; CHECK-BE-NEXT: // %bb.2: // %exit
-; CHECK-BE-NEXT: ret
+; CHECK-BE-NEXT: .LBB7_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8, lsl #5
+; CHECK-BE-NEXT: add x10, x9, #16
+; CHECK-BE-NEXT: ld1 { v0.8h }, [x9]
+; CHECK-BE-NEXT: add x9, x1, x8, lsl #4
+; CHECK-BE-NEXT: add x8, x8, #1
+; CHECK-BE-NEXT: ld1 { v1.8h }, [x10]
+; CHECK-BE-NEXT: cmp x8, #1000
+; CHECK-BE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-BE-NEXT: st1 { v0.16b }, [x9]
+; CHECK-BE-NEXT: b.eq .LBB7_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
entry:
br label %loop
@@ -610,33 +683,35 @@ exit:
define void @trunc_v8i16_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-LABEL: trunc_v8i16_to_v8i8_in_loop:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: LBB8_1: ; %loop
-; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldr q0, [x0, x8, lsl #4]
-; CHECK-NEXT: xtn.8b v0, v0
-; CHECK-NEXT: str d0, [x1, x8, lsl #3]
-; CHECK-NEXT: add x8, x8, #1
-; CHECK-NEXT: cmp x8, #1000
-; CHECK-NEXT: b.eq LBB8_1
-; CHECK-NEXT: ; %bb.2: ; %exit
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: LBB8_1: ; %loop
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ldr q0, [x0, x8, lsl #4]
+; CHECK-NEXT: xtn.8b v0, v0
+; CHECK-NEXT: str d0, [x1, x8, lsl #3]
+; CHECK-NEXT: add x8, x8, #1
+; CHECK-NEXT: cmp x8, #1000
+; CHECK-NEXT: b.eq LBB8_1
+; CHECK-NEXT: ; %bb.2: ; %exit
+; CHECK-NEXT: ret
+
; CHECK-BE-LABEL: trunc_v8i16_to_v8i8_in_loop:
; CHECK-BE: // %bb.0: // %entry
-; CHECK-BE-NEXT: mov x8, xzr
-; CHECK-BE-NEXT: .LBB8_1: // %loop
-; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-BE-NEXT: add x9, x0, x8, lsl #4
-; CHECK-BE-NEXT: ld1 { v0.8h }, [x9]
-; CHECK-BE-NEXT: add x9, x1, x8, lsl #3
-; CHECK-BE-NEXT: add x8, x8, #1
-; CHECK-BE-NEXT: cmp x8, #1000
-; CHECK-BE-NEXT: xtn v0.8b, v0.8h
-; CHECK-BE-NEXT: st1 { v0.8b }, [x9]
-; CHECK-BE-NEXT: b.eq .LBB8_1
-; CHECK-BE-NEXT: // %bb.2: // %exit
-; CHECK-BE-NEXT: ret
+; CHECK-BE-NEXT: mov x8, xzr
+; CHECK-BE-NEXT: .LBB8_1: // %loop
+; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-BE-NEXT: add x9, x0, x8, lsl #4
+; CHECK-BE-NEXT: ld1 { v0.8h }, [x9]
+; CHECK-BE-NEXT: add x9, x1, x8, lsl #3
+; CHECK-BE-NEXT: add x8, x8, #1
+; CHECK-BE-NEXT: cmp x8, #1000
+; CHECK-BE-NEXT: xtn v0.8b, v0.8h
+; CHECK-BE-NEXT: st1 { v0.8b }, [x9]
+; CHECK-BE-NEXT: b.eq .LBB8_1
+; CHECK-BE-NEXT: // %bb.2: // %exit
+; CHECK-BE-NEXT: ret
+
entry:
br label %loop
More information about the llvm-commits
mailing list