[llvm] 7b7ac4b - [X86] Expose memory codegen in element insert load tests to improve accuracy of checks

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sun Aug 22 06:55:21 PDT 2021


Author: Simon Pilgrim
Date: 2021-08-22T14:54:48+01:00
New Revision: 7b7ac4b16a2f2bab1f961e3b362c6a231fa923d2

URL: https://github.com/llvm/llvm-project/commit/7b7ac4b16a2f2bab1f961e3b362c6a231fa923d2
DIFF: https://github.com/llvm/llvm-project/commit/7b7ac4b16a2f2bab1f961e3b362c6a231fa923d2.diff

LOG: [X86] Expose memory codegen in element insert load tests to improve accuracy of checks

Also replace the X32 check prefix with X86 for the i686 tests (the X32 prefix is normally reserved for gnux32 targets)
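
For reference, the CHECK lines in these tests are regenerated with the script named in the NOTE line of each file. A minimal sketch of the invocation (assuming it is run from the monorepo root with a built llc on PATH; the script also accepts an explicit --llc-binary path), using the UTC_ARGS flag recorded in the diff below:

    llvm/utils/update_llc_test_checks.py --no_x86_scrub_mem_shuffle \
        llvm/test/CodeGen/X86/avx.ll llvm/test/CodeGen/X86/sse41.ll

With --no_x86_scrub_mem_shuffle the generated checks keep the real memory operands (e.g. vinsertps $48, (%rdi), %xmm0, %xmm0) rather than scrubbing them into a {{.*#+}} shuffle-comment pattern, which is what "expose memory codegen" refers to.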

Added: 
    

Modified: 
    llvm/test/CodeGen/X86/avx.ll
    llvm/test/CodeGen/X86/sse41.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index a176edba13aa..cc6f839e5d5c 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -1,5 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X32
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X86
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X64
 
 define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
@@ -43,16 +43,15 @@ define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
 
 define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
-; On X32, account for the argument's move to registers
-; X32-LABEL: insertps_from_vector_load:
-; X32:       ## %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X32-NEXT:    retl
+; X86-LABEL: insertps_from_vector_load:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -61,17 +60,15 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap
 
 ;; Use a non-zero CountS for insertps
 define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
-; On X32, account for the argument's move to registers
-;; Try to match a bit more of the instr, since we need the load's offset.
-; X32-LABEL: insertps_from_vector_load_offset:
-; X32:       ## %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X32-NEXT:    retl
+; X86-LABEL: insertps_from_vector_load_offset:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vinsertps $32, 4(%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X64-NEXT:    vinsertps $32, 4(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -79,20 +76,18 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>
 }
 
 define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
-; On X32, account for the argument's move to registers
-;; Try to match a bit more of the instr, since we need the load's offset.
-; X32-LABEL: insertps_from_vector_load_offset_2:
-; X32:       ## %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    shll $4, %ecx
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
-; X32-NEXT:    retl
+; X86-LABEL: insertps_from_vector_load_offset_2:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    vinsertps $0, 12(%eax,%ecx), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset_2:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    shlq $4, %rsi
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; X64-NEXT:    vinsertps $0, 12(%rdi,%rsi), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
   %2 = load <4 x float>, <4 x float>* %1, align 16
@@ -101,17 +96,16 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
 }
 
 define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
-; On X32, account for the arguments' move to registers
-; X32-LABEL: insertps_from_broadcast_loadf32:
-; X32:       ## %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X32-NEXT:    retl
+; X86-LABEL: insertps_from_broadcast_loadf32:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_broadcast_loadf32:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT:    retq
   %1 = getelementptr inbounds float, float* %fb, i64 %index
   %2 = load float, float* %1, align 4
@@ -124,16 +118,15 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap
 }
 
 define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
-; On X32, account for the arguments' move to registers
-; X32-LABEL: insertps_from_broadcast_loadv4f32:
-; X32:       ## %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X32-NEXT:    retl
+; X86-LABEL: insertps_from_broadcast_loadv4f32:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_broadcast_loadv4f32:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %b, align 4
   %2 = extractelement <4 x float> %1, i32 0
@@ -147,20 +140,19 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
 
 ;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
 define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
-; On X32, account for the arguments' move to registers
-; X32-LABEL: insertps_from_broadcast_multiple_use:
-; X32:       ## %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
-; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X32-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
-; X32-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
+; X86-LABEL: insertps_from_broadcast_multiple_use:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
+; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
+; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X86-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 ; X64:       ## %bb.0:

diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 17aae3373c5e..d15e157dc5d2 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
 ; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X86-SSE
 ; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX1
 ; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX512,X86-AVX512

