[llvm] 06f136f - [instcombine][x86] Converted pdep/pext with shifted mask to simple arithmetic
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 18 14:55:17 PDT 2020
Author: Philip Reames
Date: 2020-09-18T14:54:24-07:00
New Revision: 06f136f61e6d23fde5c91f7fa0813d0291c17c97
URL: https://github.com/llvm/llvm-project/commit/06f136f61e6d23fde5c91f7fa0813d0291c17c97
DIFF: https://github.com/llvm/llvm-project/commit/06f136f61e6d23fde5c91f7fa0813d0291c17c97.diff
LOG: [instcombine][x86] Converted pdep/pext with shifted mask to simple arithmetic
If the mask of a pdep or pext instruction is a shifted mask (i.e. one contiguous block of ones), we need at most one 'and' and one shift to represent the operation without the intrinsic. On all platforms I know of, this is faster than the pdep/pext.
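For illustration, here is a scalar sketch of the equivalence (not part of the patch; the helper names are hypothetical, and 'shift' stands for countTrailingZeros of the mask):

  #include <cstdint>

  // 'mask' is assumed to be one contiguous block of ones starting at bit 'shift'.
  uint32_t pext_shifted_mask(uint32_t x, uint32_t mask, unsigned shift) {
    return (x & mask) >> shift;   // same result as llvm.x86.bmi.pext.32 for such a mask
  }
  uint32_t pdep_shifted_mask(uint32_t x, uint32_t mask, unsigned shift) {
    return (x << shift) & mask;   // same result as llvm.x86.bmi.pdep.32 for such a mask
  }

For example, pext(x, 6) reduces to (x & 6) >> 1 and pdep(x, 12) to (x << 2) & 12, as exercised by the tests added below.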
Cost modelling for multiple contiguous blocks might be worth exploring in a follow-up, but it's not relevant for my current use case. It would almost certainly be a win on AMD parts, where these instructions are really slow, though.
Differential Revision: https://reviews.llvm.org/D87861
Added:
Modified:
llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 94ee79901075..10f0018a0f71 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -999,6 +999,20 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
}
+ if (MaskC->getValue().isShiftedMask()) {
+ // Any single contiguous sequence of 1s anywhere in the mask simply
+ // describes a subset of the input bits shifted to the appropriate
+ // position. Replace with the straightforward IR.
+ unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+ Value *Input = II.getArgOperand(0);
+ Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
+ Value *Shifted = IC.Builder.CreateLShr(Masked,
+ ConstantInt::get(II.getType(),
+ ShiftAmount));
+ return IC.replaceInstUsesWith(II, Shifted);
+ }
+
+
if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
uint64_t Src = SrcC->getZExtValue();
uint64_t Mask = MaskC->getZExtValue();
@@ -1030,6 +1044,18 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (MaskC->isAllOnesValue()) {
return IC.replaceInstUsesWith(II, II.getArgOperand(0));
}
+ if (MaskC->getValue().isShiftedMask()) {
+ // Any single contiguous sequence of 1s anywhere in the mask simply
+ // describes a subset of the input bits shifted to the appropriate
+ // position. Replace with the straightforward IR.
+ unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
+ Value *Input = II.getArgOperand(0);
+ Value *Shifted = IC.Builder.CreateShl(Input,
+ ConstantInt::get(II.getType(),
+ ShiftAmount));
+ Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
+ return IC.replaceInstUsesWith(II, Masked);
+ }
if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
uint64_t Src = SrcC->getZExtValue();
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll b/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
index 8d04eca11485..b7f814630c8a 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-bmi-tbm.ll
@@ -306,6 +306,27 @@ define i64 @test_x86_pext_64_allones_mask(i64 %x) nounwind readnone {
ret i64 %1
}
+define i32 @test_x86_pext_32_shifted_mask(i32 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pext_32_shifted_mask(
+; CHECK-NEXT: %1 = lshr i32 %x, 1
+; CHECK-NEXT: %2 = and i32 %1, 3
+; CHECK-NEXT: ret i32 %2
+;
+ %1 = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 6)
+ ret i32 %1
+}
+
+define i64 @test_x86_pext_64_shifted_mask(i64 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pext_64_shifted_mask(
+; CHECK-NEXT: %1 = lshr i64 %x, 1
+; CHECK-NEXT: %2 = and i64 %1, 3
+; CHECK-NEXT: ret i64 %2
+;
+ %1 = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 6)
+ ret i64 %1
+}
+
+
define i32 @test_x86_pext_32_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_x86_pext_32_constant_fold(
; CHECK-NEXT: ret i32 30001
@@ -370,6 +391,27 @@ define i64 @test_x86_pdep_64_allones_mask(i64 %x) nounwind readnone {
ret i64 %1
}
+define i32 @test_x86_pdep_32_shifted_mask(i32 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pdep_32_shifted_mask(
+; CHECK-NEXT: %1 = shl i32 %x, 2
+; CHECK-NEXT: %2 = and i32 %1, 12
+; CHECK-NEXT: ret i32 %2
+;
+ %1 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 12)
+ ret i32 %1
+}
+
+define i64 @test_x86_pdep_64_shifted_mask(i64 %x) nounwind readnone {
+; CHECK-LABEL: @test_x86_pdep_64_shifted_mask(
+; CHECK-NEXT: %1 = shl i64 %x, 2
+; CHECK-NEXT: %2 = and i64 %1, 12
+; CHECK-NEXT: ret i64 %2
+;
+ %1 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 12)
+ ret i64 %1
+}
+
+
define i32 @test_x86_pdep_32_constant_fold() nounwind readnone {
; CHECK-LABEL: @test_x86_pdep_32_constant_fold(
; CHECK-NEXT: ret i32 807407616