[clang] [llvm] InstCombine: Order shufflevector operands by complexity (PR #113212)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 21 13:18:55 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Matthias Braun (MatzeB)
<details>
<summary>Changes</summary>
As shufflevector is effectively commutative (its two vector operands can be
swapped as long as the mask indices are adjusted to match), we should apply
the same logic as for other commutative operations, where we order the inputs
by their `getComplexity()` value. This will put things like `undef`,
`poison` and constants on the right-hand side where possible.
We previously had a rule that moved `undef` to the right-hand side; that rule
is superseded by this change.
---
Patch is 55.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/113212.diff
10 Files Affected:
- (modified) clang/test/CodeGen/X86/avx-shuffle-builtins.c (+450-72)
- (modified) clang/test/CodeGen/X86/sse.c (+5-6)
- (modified) llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp (+18-13)
- (modified) llvm/test/Analysis/ValueTracking/knownbits-x86-hadd-hsub.ll (+1-1)
- (modified) llvm/test/Transforms/InstCombine/insert-extract-shuffle-inseltpoison.ll (+2-2)
- (modified) llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll (+3-3)
- (modified) llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll (+1-1)
- (modified) llvm/test/Transforms/InstCombine/vec_shuffle.ll (+41-2)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll (+1-1)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll (+1-1)
``````````diff
diff --git a/clang/test/CodeGen/X86/avx-shuffle-builtins.c b/clang/test/CodeGen/X86/avx-shuffle-builtins.c
index d184d28f3e07aa..8d5b2c1d8c4394 100644
--- a/clang/test/CodeGen/X86/avx-shuffle-builtins.c
+++ b/clang/test/CodeGen/X86/avx-shuffle-builtins.c
@@ -1,7 +1,7 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// REQUIRES: x86-registered-target
-// RUN: %clang_cc1 -ffreestanding %s -O3 -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X64
-// RUN: %clang_cc1 -ffreestanding %s -O3 -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X86
-// FIXME: This is testing optimized generation of shuffle instructions and should be fixed.
+// RUN: %clang_cc1 -ffreestanding %s -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -ffreestanding %s -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK
#include <immintrin.h>
@@ -10,201 +10,579 @@
// Test LLVM IR codegen of shuffle instructions, checking if the masks are correct
//
+// CHECK-LABEL: define dso_local <8 x float> @x(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT: [[SHUFP:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[TMP1]], <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
+// CHECK-NEXT: ret <8 x float> [[SHUFP]]
+//
__m256 x(__m256 a, __m256 b) {
- // CHECK-LABEL: x
- // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15>
return _mm256_shuffle_ps(a, b, 203);
}
+// CHECK-LABEL: define dso_local <2 x double> @test_mm_permute_pd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: ret <2 x double> [[PERMIL]]
+//
__m128d test_mm_permute_pd(__m128d a) {
- // CHECK-LABEL: test_mm_permute_pd
- // CHECK: shufflevector{{.*}}<i32 1, i32 0>
return _mm_permute_pd(a, 1);
}
+// CHECK-LABEL: define dso_local <4 x double> @test_mm256_permute_pd(
+// CHECK-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x double> [[PERMIL]]
+//
__m256d test_mm256_permute_pd(__m256d a) {
- // CHECK-LABEL: test_mm256_permute_pd
- // CHECK: shufflevector{{.*}}<i32 1, i32 0, i32 3, i32 2>
return _mm256_permute_pd(a, 5);
}
+// CHECK-LABEL: define dso_local <4 x float> @test_mm_permute_ps(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <4 x float> [[PERMIL]]
+//
__m128 test_mm_permute_ps(__m128 a) {
- // CHECK-LABEL: test_mm_permute_ps
- // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 1, i32 0>
return _mm_permute_ps(a, 0x1b);
}
// Test case for PR12401
+// CHECK-LABEL: define dso_local <4 x float> @test_mm_permute_ps2(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x float> [[PERMIL]]
+//
__m128 test_mm_permute_ps2(__m128 a) {
- // CHECK-LABEL: test_mm_permute_ps2
- // CHECK: shufflevector{{.*}}<i32 2, i32 1, i32 2, i32 3>
return _mm_permute_ps(a, 0xe6);
}
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_permute_ps(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x float> [[PERMIL]]
+//
__m256 test_mm256_permute_ps(__m256 a) {
- // CHECK-LABEL: test_mm256_permute_ps
- // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
return _mm256_permute_ps(a, 0x1b);
}
+// CHECK-LABEL: define dso_local <4 x double> @test_mm256_permute2f128_pd(
+// CHECK-SAME: <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT: [[VPERM:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+// CHECK-NEXT: ret <4 x double> [[VPERM]]
+//
__m256d test_mm256_permute2f128_pd(__m256d a, __m256d b) {
- // CHECK-LABEL: test_mm256_permute2f128_pd
- // CHECK: shufflevector{{.*}}<i32 2, i32 3, i32 6, i32 7>
return _mm256_permute2f128_pd(a, b, 0x31);
}
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_permute2f128_ps(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT: [[VPERM:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP0]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: ret <8 x float> [[VPERM]]
+//
__m256 test_mm256_permute2f128_ps(__m256 a, __m256 b) {
- // CHECK-LABEL: test_mm256_permute2f128_ps
- // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
return _mm256_permute2f128_ps(a, b, 0x13);
}
+// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_permute2f128_si256(
+// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to <8 x i32>
+// CHECK-NEXT: [[VPERM:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[VPERM]] to <4 x i64>
+// CHECK-NEXT: ret <4 x i64> [[TMP4]]
+//
__m256i test_mm256_permute2f128_si256(__m256i a, __m256i b) {
- // CHECK-LABEL: test_mm256_permute2f128_si256
- // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
return _mm256_permute2f128_si256(a, b, 0x20);
}
__m128
+// CHECK-LABEL: define dso_local <4 x float> @test_mm_broadcast_ss(
+// CHECK-SAME: ptr noundef [[__A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[__F_I:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[__A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[__A]], ptr [[__A_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__A_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 1
+// CHECK-NEXT: store float [[TMP2]], ptr [[__F_I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[__F_I]], align 4
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
+// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[__F_I]], align 4
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[TMP4]], i32 1
+// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[__F_I]], align 4
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[TMP5]], i32 2
+// CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[__F_I]], align 4
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT3_I]], float [[TMP6]], i32 3
+// CHECK-NEXT: store <4 x float> [[VECINIT4_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT: ret <4 x float> [[TMP7]]
+//
test_mm_broadcast_ss(float const *__a) {
- // CHECK-LABEL: test_mm_broadcast_ss
- // CHECK: insertelement <4 x float> {{.*}}, i64 0
- // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> poison, <4 x i32> zeroinitializer
return _mm_broadcast_ss(__a);
}
__m256d
+// CHECK-LABEL: define dso_local <4 x double> @test_mm256_broadcast_sd(
+// CHECK-SAME: ptr noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[__D_I:%.*]] = alloca double, align 8
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[__A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[__A]], ptr [[__A_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__A_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[TMP1]], align 1
+// CHECK-NEXT: store double [[TMP2]], ptr [[__D_I]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[__D_I]], align 8
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x double> poison, double [[TMP3]], i32 0
+// CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[__D_I]], align 8
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x double> [[VECINIT_I]], double [[TMP4]], i32 1
+// CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[__D_I]], align 8
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x double> [[VECINIT2_I]], double [[TMP5]], i32 2
+// CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[__D_I]], align 8
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <4 x double> [[VECINIT3_I]], double [[TMP6]], i32 3
+// CHECK-NEXT: store <4 x double> [[VECINIT4_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32
+// CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32
+// CHECK-NEXT: ret <4 x double> [[TMP7]]
+//
test_mm256_broadcast_sd(double const *__a) {
- // CHECK-LABEL: test_mm256_broadcast_sd
- // CHECK: insertelement <4 x double> {{.*}}, i64 0
- // CHECK: shufflevector <4 x double> {{.*}}, <4 x double> poison, <4 x i32> zeroinitializer
return _mm256_broadcast_sd(__a);
}
__m256
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_broadcast_ss(
+// CHECK-SAME: ptr noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__A_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[__F_I:%.*]] = alloca float, align 4
+// CHECK-NEXT: [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[__A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[__A]], ptr [[__A_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__A_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 1
+// CHECK-NEXT: store float [[TMP2]], ptr [[__F_I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[__F_I]], align 4
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
+// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[__F_I]], align 4
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP4]], i32 1
+// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[__F_I]], align 4
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP5]], i32 2
+// CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[__F_I]], align 4
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP6]], i32 3
+// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[__F_I]], align 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP7]], i32 4
+// CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[__F_I]], align 4
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP8]], i32 5
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[__F_I]], align 4
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP9]], i32 6
+// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[__F_I]], align 4
+// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <8 x float> [[VECINIT7_I]], float [[TMP10]], i32 7
+// CHECK-NEXT: store <8 x float> [[VECINIT8_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32
+// CHECK-NEXT: [[TMP11:%.*]] = load <8 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32
+// CHECK-NEXT: ret <8 x float> [[TMP11]]
+//
test_mm256_broadcast_ss(float const *__a) {
- // CHECK-LABEL: test_mm256_broadcast_ss
- // CHECK: insertelement <8 x float> {{.*}}, i64 0
- // CHECK: shufflevector <8 x float> {{.*}}, <8 x float> poison, <8 x i32> zeroinitializer
return _mm256_broadcast_ss(__a);
}
// Make sure we have the correct mask for each insertf128 case.
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_insertf128_ps_0(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[INSERT:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[WIDEN]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: ret <8 x float> [[INSERT]]
+//
__m256 test_mm256_insertf128_ps_0(__m256 a, __m128 b) {
- // CHECK-LABEL: test_mm256_insertf128_ps_0
- // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
return _mm256_insertf128_ps(a, b, 0);
}
+// CHECK-LABEL: define dso_local <4 x double> @test_mm256_insertf128_pd_0(
+// CHECK-SAME: <4 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[INSERT:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[WIDEN]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+// CHECK-NEXT: ret <4 x double> [[INSERT]]
+//
__m256d test_mm256_insertf128_pd_0(__m256d a, __m128d b) {
- // CHECK-LABEL: test_mm256_insertf128_pd_0
- // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 6, i32 7>
return _mm256_insertf128_pd(a, b, 0);
}
+// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_insertf128_si256_0(
+// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT: store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[WIDEN:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[INSERT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[WIDEN]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[INSERT]] to <4 x i64>
+// CHECK-NEXT: ret <4 x i64> [[TMP4]]
+//
__m256i test_mm256_insertf128_si256_0(__m256i a, __m128i b) {
- // CHECK-LABEL: test_mm256_insertf128_si256_0
- // X64: shufflevector{{.*}}<i32 0, i32 1, i32 6, i32 7>
- // X86: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
return _mm256_insertf128_si256(a, b, 0);
}
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_insertf128_ps_1(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT: store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT: store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = l...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/113212
More information about the llvm-commits
mailing list