[llvm] 9d09d20 - Reapply "[X86] Limit X86InterleavedAccessGroup to handle the same type case only"
via llvm-commits
llvm-commits at lists.llvm.org
Wed May 19 07:50:44 PDT 2021
Author: Wang, Pengfei
Date: 2021-05-19T22:27:16+08:00
New Revision: 9d09d20448e48c78035c40982646b7b26fee88c3
URL: https://github.com/llvm/llvm-project/commit/9d09d20448e48c78035c40982646b7b26fee88c3
DIFF: https://github.com/llvm/llvm-project/commit/9d09d20448e48c78035c40982646b7b26fee88c3.diff
LOG: Reapply "[X86] Limit X86InterleavedAccessGroup to handle the same type case only"
The current implementation assumes the destination type of the shuffle is the same as the decomposed ones. Add a check to avoid a crash when the condition is not satisfied.
This fixes PR37616.
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D102751
Added:
Modified:
llvm/lib/Target/X86/X86InterleavedAccess.cpp
llvm/test/CodeGen/X86/x86-interleaved-access.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 95655dd4723bc..40174a1297b77 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -724,30 +724,34 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());
if (isa<LoadInst>(Inst)) {
- // Try to generate target-sized register(/instruction).
- decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
-
auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
- // Perform matrix-transposition in order to compute interleaved
- // results by generating some sort of (optimized) target-specific
- // instructions.
-
switch (NumSubVecElems) {
default:
return false;
case 4:
- transpose_4x4(DecomposedVectors, TransposedVectors);
- break;
case 8:
case 16:
case 32:
case 64:
- deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
- NumSubVecElems);
+ if (ShuffleTy->getNumElements() != NumSubVecElems)
+ return false;
break;
}
+ // Try to generate target-sized register(/instruction).
+ decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
+
+ // Perform matrix-transposition in order to compute interleaved
+ // results by generating some sort of (optimized) target-specific
+ // instructions.
+
+ if (NumSubVecElems == 4)
+ transpose_4x4(DecomposedVectors, TransposedVectors);
+ else
+ deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
+ NumSubVecElems);
+
// Now replace the unoptimized-interleaved-vectors with the
// transposed-interleaved vectors.
for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index f99b065bc2ecc..32598cdcbf088 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1930,3 +1930,22 @@ define void @splat4_v4i64_load_store(<4 x i64>* %s, <16 x i64>* %d) {
store <16 x i64> %r, <16 x i64>* %d, align 8
ret void
}
+
+define <2 x i64> @PR37616(<16 x i64>* %a0) {
+; AVX1-LABEL: PR37616:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovaps 16(%rdi), %xmm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT: retq
+;
+; AVX2OR512-LABEL: PR37616:
+; AVX2OR512: # %bb.0:
+; AVX2OR512-NEXT: vmovaps (%rdi), %ymm0
+; AVX2OR512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX2OR512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2OR512-NEXT: vzeroupper
+; AVX2OR512-NEXT: retq
+ %load = load <16 x i64>, <16 x i64>* %a0, align 128
+ %shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6>
+ ret <2 x i64> %shuffle
+}
More information about the llvm-commits
mailing list