[llvm] 8491d01 - [AArch64] Lower vector trunc using tbl.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 16 04:43:22 PDT 2022
Author: Florian Hahn
Date: 2022-09-16T12:42:49+01:00
New Revision: 8491d01cc385d08b8b4f5dd097239ea0009ddc63
URL: https://github.com/llvm/llvm-project/commit/8491d01cc385d08b8b4f5dd097239ea0009ddc63
DIFF: https://github.com/llvm/llvm-project/commit/8491d01cc385d08b8b4f5dd097239ea0009ddc63.diff
LOG: [AArch64] Lower vector trunc using tbl.
Similar to using tbl to lower vector ZExts, tbl4 can be used to lower
vector truncates.
The initial version supports i32->i8 conversions.
Depends on D120571
Reviewed By: t.p.northover
Differential Revision: https://reviews.llvm.org/D133495
Added:
Modified:
llvm/lib/CodeGen/CodeGenPrepare.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 45416edb3011..17dac33840a6 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8047,8 +8047,9 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
return true;
- if (isa<UIToFPInst>(I) && TLI->optimizeExtendOrTruncateConversion(
- I, LI->getLoopFor(I->getParent())))
+ if ((isa<UIToFPInst>(I) || isa<TruncInst>(I)) &&
+ TLI->optimizeExtendOrTruncateConversion(I,
+ LI->getLoopFor(I->getParent())))
return true;
if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 26fbcc71a555..16926e8f5688 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13209,6 +13209,44 @@ static void createTblShuffleForZExt(ZExtInst *ZExt, bool IsLittleEndian) {
ZExt->eraseFromParent();
}
+static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
+ IRBuilder<> Builder(TI);
+ SmallVector<Value *> Parts;
+ Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
+ Parts.push_back(Builder.CreateBitCast(
+ Builder.CreateShuffleVector(TI->getOperand(0), {0, 1, 2, 3}), VecTy));
+ Parts.push_back(Builder.CreateBitCast(
+ Builder.CreateShuffleVector(TI->getOperand(0), {4, 5, 6, 7}), VecTy));
+
+ Intrinsic::ID TblID = Intrinsic::aarch64_neon_tbl2;
+ unsigned NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
+ if (NumElements == 16) {
+ Parts.push_back(Builder.CreateBitCast(
+ Builder.CreateShuffleVector(TI->getOperand(0), {8, 9, 10, 11}), VecTy));
+ Parts.push_back(Builder.CreateBitCast(
+ Builder.CreateShuffleVector(TI->getOperand(0), {12, 13, 14, 15}),
+ VecTy));
+ TblID = Intrinsic::aarch64_neon_tbl4;
+ }
+ SmallVector<Constant *, 16> MaskConst;
+ for (unsigned Idx = 0; Idx < NumElements * 4; Idx += 4)
+ MaskConst.push_back(
+ ConstantInt::get(Builder.getInt8Ty(), IsLittleEndian ? Idx : Idx + 3));
+
+ for (unsigned Idx = NumElements * 4; Idx < 64; Idx += 4)
+ MaskConst.push_back(ConstantInt::get(Builder.getInt8Ty(), 255));
+
+ Parts.push_back(ConstantVector::get(MaskConst));
+ auto *F =
+ Intrinsic::getDeclaration(TI->getModule(), TblID, Parts[0]->getType());
+ Value *Res = Builder.CreateCall(F, Parts);
+
+ if (NumElements == 8)
+ Res = Builder.CreateShuffleVector(Res, {0, 1, 2, 3, 4, 5, 6, 7});
+ TI->replaceAllUsesWith(Res);
+ TI->eraseFromParent();
+}
+
bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
Loop *L) const {
// Try to optimize conversions using tbl. This requires materializing constant
@@ -13250,6 +13288,18 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(Instruction *I,
createTblShuffleForZExt(ZExt, Subtarget->isLittleEndian());
return true;
}
+
+ // Convert 'trunc <(8|16) x i32> %x to <(8|16) x i8>' to a single tbl.4
+ // instruction selecting the lowest 8 bits per lane of the input interpreted
+ // as 2 or 4 <4 x i32> vectors.
+ auto *TI = dyn_cast<TruncInst>(I);
+ if (TI && (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
+ SrcTy->getElementType()->isIntegerTy(32) &&
+ DstTy->getElementType()->isIntegerTy(8)) {
+ createTblForTrunc(TI, Subtarget->isLittleEndian());
+ return true;
+ }
+
return false;
}
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index 80aff21dc86e..76488f588177 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -2,49 +2,90 @@
; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s
; RUN: llc -mtriple=aarch64_be-unknown-linux -o - %s | FileCheck --check-prefix=CHECK-BE %s
+; CHECK-LABEL: lCPI0_0:
+; CHECK-NEXT: .byte 0 ; 0x0
+; CHECK-NEXT: .byte 4 ; 0x4
+; CHECK-NEXT: .byte 8 ; 0x8
+; CHECK-NEXT: .byte 12 ; 0xc
+; CHECK-NEXT: .byte 16 ; 0x10
+; CHECK-NEXT: .byte 20 ; 0x14
+; CHECK-NEXT: .byte 24 ; 0x18
+; CHECK-NEXT: .byte 28 ; 0x1c
+; CHECK-NEXT: .byte 32 ; 0x20
+; CHECK-NEXT: .byte 36 ; 0x24
+; CHECK-NEXT: .byte 40 ; 0x28
+; CHECK-NEXT: .byte 44 ; 0x2c
+; CHECK-NEXT: .byte 48 ; 0x30
+; CHECK-NEXT: .byte 52 ; 0x34
+; CHECK-NEXT: .byte 56 ; 0x38
+; CHECK-NEXT: .byte 60 ; 0x3c
+
+; CHECK-BE-LABEL: .LCPI0_0:
+; CHECK-BE-NEXT: .byte 3 // 0x3
+; CHECK-BE-NEXT: .byte 7 // 0x7
+; CHECK-BE-NEXT: .byte 11 // 0xb
+; CHECK-BE-NEXT: .byte 15 // 0xf
+; CHECK-BE-NEXT: .byte 19 // 0x13
+; CHECK-BE-NEXT: .byte 23 // 0x17
+; CHECK-BE-NEXT: .byte 27 // 0x1b
+; CHECK-BE-NEXT: .byte 31 // 0x1f
+; CHECK-BE-NEXT: .byte 35 // 0x23
+; CHECK-BE-NEXT: .byte 39 // 0x27
+; CHECK-BE-NEXT: .byte 43 // 0x2b
+; CHECK-BE-NEXT: .byte 47 // 0x2f
+; CHECK-BE-NEXT: .byte 51 // 0x33
+; CHECK-BE-NEXT: .byte 55 // 0x37
+; CHECK-BE-NEXT: .byte 59 // 0x3b
+; CHECK-BE-NEXT: .byte 63 // 0x3f
+
; It's profitable to use a single tbl.4 instruction to lower the truncate.
define void @trunc_v16i32_to_v16i8_in_loop(ptr %A, ptr %dst) {
; CHECK-LABEL: trunc_v16i32_to_v16i8_in_loop:
; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: Lloh0:
+; CHECK-NEXT: adrp x9, lCPI0_0 at PAGE
; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: Lloh1:
+; CHECK-NEXT: ldr q0, [x9, lCPI0_0 at PAGEOFF]
; CHECK-NEXT: LBB0_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x9, x0, x8, lsl #6
-; CHECK-NEXT: ldp q1, q0, [x9, #32]
-; CHECK-NEXT: ldp q3, q2, [x9]
-; CHECK-NEXT: uzp1.8h v0, v1, v0
-; CHECK-NEXT: uzp1.8h v1, v3, v2
-; CHECK-NEXT: uzp1.16b v0, v1, v0
-; CHECK-NEXT: str q0, [x1, x8, lsl #4]
+; CHECK-NEXT: ldp q1, q2, [x9]
+; CHECK-NEXT: ldp q3, q4, [x9, #32]
+; CHECK-NEXT: tbl.16b v1, { v1, v2, v3, v4 }, v0
+; CHECK-NEXT: str q1, [x1, x8, lsl #4]
; CHECK-NEXT: add x8, x8, #1
; CHECK-NEXT: cmp x8, #1000
; CHECK-NEXT: b.eq LBB0_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
+; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1
;
; CHECK-BE-LABEL: trunc_v16i32_to_v16i8_in_loop:
; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: adrp x8, .LCPI0_0
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_0
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x8]
; CHECK-BE-NEXT: mov x8, xzr
; CHECK-BE-NEXT: .LBB0_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: add x9, x0, x8, lsl #6
-; CHECK-BE-NEXT: add x10, x9, #48
+; CHECK-BE-NEXT: add x10, x9, #16
; CHECK-BE-NEXT: add x11, x9, #32
-; CHECK-BE-NEXT: ld1 { v0.4s }, [x9]
-; CHECK-BE-NEXT: add x9, x9, #16
-; CHECK-BE-NEXT: ld1 { v1.4s }, [x10]
-; CHECK-BE-NEXT: ld1 { v2.4s }, [x11]
-; CHECK-BE-NEXT: ld1 { v3.4s }, [x9]
+; CHECK-BE-NEXT: ld1 { v1.16b }, [x9]
+; CHECK-BE-NEXT: add x9, x9, #48
+; CHECK-BE-NEXT: ld1 { v2.16b }, [x10]
+; CHECK-BE-NEXT: ld1 { v3.16b }, [x11]
+; CHECK-BE-NEXT: ld1 { v4.16b }, [x9]
; CHECK-BE-NEXT: add x9, x1, x8, lsl #4
; CHECK-BE-NEXT: add x8, x8, #1
; CHECK-BE-NEXT: cmp x8, #1000
-; CHECK-BE-NEXT: uzp1 v1.8h, v2.8h, v1.8h
-; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v3.8h
-; CHECK-BE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
-; CHECK-BE-NEXT: st1 { v0.16b }, [x9]
+; CHECK-BE-NEXT: tbl v1.16b, { v1.16b, v2.16b, v3.16b, v4.16b }, v0.16b
+; CHECK-BE-NEXT: st1 { v1.16b }, [x9]
; CHECK-BE-NEXT: b.eq .LBB0_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: ret
+
entry:
br label %loop
@@ -97,42 +138,85 @@ entry:
ret void
}
+
+; CHECK-LABEL: lCPI2_0:
+; CHECK-NEXT: .byte 0 ; 0x0
+; CHECK-NEXT: .byte 4 ; 0x4
+; CHECK-NEXT: .byte 8 ; 0x8
+; CHECK-NEXT: .byte 12 ; 0xc
+; CHECK-NEXT: .byte 16 ; 0x10
+; CHECK-NEXT: .byte 20 ; 0x14
+; CHECK-NEXT: .byte 24 ; 0x18
+; CHECK-NEXT: .byte 28 ; 0x1c
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+; CHECK-NEXT: .byte 255 ; 0xff
+
+; CHECK-BE-LABEL: .LCPI2_0:
+; CHECK-BE-NEXT: .byte 3 // 0x3
+; CHECK-BE-NEXT: .byte 7 // 0x7
+; CHECK-BE-NEXT: .byte 11 // 0xb
+; CHECK-BE-NEXT: .byte 15 // 0xf
+; CHECK-BE-NEXT: .byte 19 // 0x13
+; CHECK-BE-NEXT: .byte 23 // 0x17
+; CHECK-BE-NEXT: .byte 27 // 0x1b
+; CHECK-BE-NEXT: .byte 31 // 0x1f
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
+; CHECK-BE-NEXT: .byte 255 // 0xff
; It's profitable to use a single tbl.2 instruction to lower the truncate.
define void @trunc_v8i32_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-LABEL: trunc_v8i32_to_v8i8_in_loop:
; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: Lloh2:
+; CHECK-NEXT: adrp x9, lCPI2_0 at PAGE
; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: Lloh3:
+; CHECK-NEXT: ldr q0, [x9, lCPI2_0 at PAGEOFF]
; CHECK-NEXT: LBB2_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: add x9, x0, x8, lsl #5
-; CHECK-NEXT: ldp q1, q0, [x9]
-; CHECK-NEXT: uzp1.8h v0, v1, v0
-; CHECK-NEXT: xtn.8b v0, v0
-; CHECK-NEXT: str d0, [x1, x8, lsl #3]
+; CHECK-NEXT: ldp q1, q2, [x9]
+; CHECK-NEXT: tbl.16b v1, { v1, v2 }, v0
+; CHECK-NEXT: str d1, [x1, x8, lsl #3]
; CHECK-NEXT: add x8, x8, #1
; CHECK-NEXT: cmp x8, #1000
; CHECK-NEXT: b.eq LBB2_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: ret
+; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3
;
; CHECK-BE-LABEL: trunc_v8i32_to_v8i8_in_loop:
; CHECK-BE: // %bb.0: // %entry
+; CHECK-BE-NEXT: adrp x8, .LCPI2_0
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI2_0
+; CHECK-BE-NEXT: ld1 { v0.16b }, [x8]
; CHECK-BE-NEXT: mov x8, xzr
; CHECK-BE-NEXT: .LBB2_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: add x9, x0, x8, lsl #5
; CHECK-BE-NEXT: add x10, x9, #16
-; CHECK-BE-NEXT: ld1 { v0.4s }, [x9]
+; CHECK-BE-NEXT: ld1 { v1.16b }, [x9]
; CHECK-BE-NEXT: add x9, x1, x8, lsl #3
; CHECK-BE-NEXT: add x8, x8, #1
-; CHECK-BE-NEXT: ld1 { v1.4s }, [x10]
+; CHECK-BE-NEXT: ld1 { v2.16b }, [x10]
; CHECK-BE-NEXT: cmp x8, #1000
-; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-BE-NEXT: xtn v0.8b, v0.8h
-; CHECK-BE-NEXT: st1 { v0.8b }, [x9]
+; CHECK-BE-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v0.16b
+; CHECK-BE-NEXT: st1 { v1.8b }, [x9]
; CHECK-BE-NEXT: b.eq .LBB2_1
; CHECK-BE-NEXT: // %bb.2: // %exit
; CHECK-BE-NEXT: ret
+
entry:
br label %loop
More information about the llvm-commits
mailing list