[llvm] 13b4db4 - [X86][SSE] Expose all memory offsets in expand load tests
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 6 03:28:54 PDT 2020
Author: Simon Pilgrim
Date: 2020-08-06T11:28:42+01:00
New Revision: 13b4db4ec20649e2c32f9bee426c2fc555275d25
URL: https://github.com/llvm/llvm-project/commit/13b4db4ec20649e2c32f9bee426c2fc555275d25
DIFF: https://github.com/llvm/llvm-project/commit/13b4db4ec20649e2c32f9bee426c2fc555275d25.diff
LOG: [X86][SSE] Expose all memory offsets in expand load tests
Since we're messing with individual element loads, we need to expose the memory offsets to show what's going on.
Part of the work to fix the masked_expandload.ll regressions in D66004
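For reference, a command along these lines should regenerate the assertions with the memory offsets kept visible rather than scrubbed (a sketch, assuming it is run from the llvm-project source root; the flag matches the UTC_ARGS noted in the test header):

  # Regenerate CHECK lines, disabling the x86 memory-shuffle scrubbing
  utils/update_llc_test_checks.py --no_x86_scrub_mem_shuffle llvm/test/CodeGen/X86/masked_expandload.ll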
Added:
Modified:
llvm/test/CodeGen/X86/masked_expandload.ll
llvm/test/CodeGen/X86/pr39666.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index 344f300b959c..d627351a0c34 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
@@ -27,12 +27,12 @@ define <2 x double> @expandload_v2f64_v2i64(double* %base, <2 x double> %src0, <
; SSE2-NEXT: LBB0_4: ## %else2
; SSE2-NEXT: retq
; SSE2-NEXT: LBB0_1: ## %cond.load
-; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT: movlps (%rdi), %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB0_4
; SSE2-NEXT: LBB0_3: ## %cond.load1
-; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE2-NEXT: movhps (%rdi), %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
; SSE2-NEXT: retq
;
; SSE42-LABEL: expandload_v2f64_v2i64:
@@ -48,12 +48,12 @@ define <2 x double> @expandload_v2f64_v2i64(double* %base, <2 x double> %src0, <
; SSE42-NEXT: LBB0_4: ## %else2
; SSE42-NEXT: retq
; SSE42-NEXT: LBB0_1: ## %cond.load
-; SSE42-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE42-NEXT: movlps (%rdi), %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB0_4
; SSE42-NEXT: LBB0_3: ## %cond.load1
-; SSE42-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE42-NEXT: movhps (%rdi), %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: expandload_v2f64_v2i64:
@@ -69,12 +69,12 @@ define <2 x double> @expandload_v2f64_v2i64(double* %base, <2 x double> %src0, <
; AVX1OR2-NEXT: LBB0_4: ## %else2
; AVX1OR2-NEXT: retq
; AVX1OR2-NEXT: LBB0_1: ## %cond.load
-; AVX1OR2-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; AVX1OR2-NEXT: vmovlps (%rdi), %xmm0, %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
; AVX1OR2-NEXT: addq $8, %rdi
; AVX1OR2-NEXT: testb $2, %al
; AVX1OR2-NEXT: je LBB0_4
; AVX1OR2-NEXT: LBB0_3: ## %cond.load1
-; AVX1OR2-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX1OR2-NEXT: vmovhps (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: expandload_v2f64_v2i64:
@@ -125,22 +125,22 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, <
; SSE2-NEXT: LBB1_8: ## %else10
; SSE2-NEXT: retq
; SSE2-NEXT: LBB1_1: ## %cond.load
-; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT: movlps (%rdi), %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB1_4
; SSE2-NEXT: LBB1_3: ## %cond.load1
-; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE2-NEXT: movhps (%rdi), %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB1_6
; SSE2-NEXT: LBB1_5: ## %cond.load5
-; SSE2-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; SSE2-NEXT: movlps (%rdi), %xmm1 ## xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT: addq $8, %rdi
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB1_8
; SSE2-NEXT: LBB1_7: ## %cond.load9
-; SSE2-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; SSE2-NEXT: movhps (%rdi), %xmm1 ## xmm1 = xmm1[0,1],mem[0,1]
; SSE2-NEXT: retq
;
; SSE42-LABEL: expandload_v4f64_v4i64:
@@ -164,22 +164,22 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, <
; SSE42-NEXT: LBB1_8: ## %else10
; SSE42-NEXT: retq
; SSE42-NEXT: LBB1_1: ## %cond.load
-; SSE42-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE42-NEXT: movlps (%rdi), %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB1_4
; SSE42-NEXT: LBB1_3: ## %cond.load1
-; SSE42-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE42-NEXT: movhps (%rdi), %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: testb $4, %al
; SSE42-NEXT: je LBB1_6
; SSE42-NEXT: LBB1_5: ## %cond.load5
-; SSE42-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; SSE42-NEXT: movlps (%rdi), %xmm1 ## xmm1 = mem[0,1],xmm1[2,3]
; SSE42-NEXT: addq $8, %rdi
; SSE42-NEXT: testb $8, %al
; SSE42-NEXT: je LBB1_8
; SSE42-NEXT: LBB1_7: ## %cond.load9
-; SSE42-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; SSE42-NEXT: movhps (%rdi), %xmm1 ## xmm1 = xmm1[0,1],mem[0,1]
; SSE42-NEXT: retq
;
; AVX1-LABEL: expandload_v4f64_v4i64:
@@ -204,27 +204,27 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, <
; AVX1-NEXT: LBB1_8: ## %else10
; AVX1-NEXT: retq
; AVX1-NEXT: LBB1_1: ## %cond.load
-; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je LBB1_4
; AVX1-NEXT: LBB1_3: ## %cond.load1
-; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm0[0],mem[0]
+; AVX1-NEXT: vmovhpd (%rdi), %xmm0, %xmm1 ## xmm1 = xmm0[0],mem[0]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je LBB1_6
; AVX1-NEXT: LBB1_5: ## %cond.load5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; AVX1-NEXT: vmovlpd (%rdi), %xmm1, %xmm1 ## xmm1 = mem[0],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je LBB1_8
; AVX1-NEXT: LBB1_7: ## %cond.load9
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-NEXT: vmovhps (%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0,1],mem[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -247,27 +247,27 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, <
; AVX2-NEXT: LBB1_8: ## %else10
; AVX2-NEXT: retq
; AVX2-NEXT: LBB1_1: ## %cond.load
-; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je LBB1_4
; AVX2-NEXT: LBB1_3: ## %cond.load1
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm0[0],mem[0]
+; AVX2-NEXT: vmovhpd (%rdi), %xmm0, %xmm1 ## xmm1 = xmm0[0],mem[0]
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je LBB1_6
; AVX2-NEXT: LBB1_5: ## %cond.load5
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; AVX2-NEXT: vmovlpd (%rdi), %xmm1, %xmm1 ## xmm1 = mem[0],xmm1[1]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je LBB1_8
; AVX2-NEXT: LBB1_7: ## %cond.load9
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-NEXT: vmovhpd (%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0],mem[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -324,42 +324,42 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8
; SSE-NEXT: LBB2_16: ## %else26
; SSE-NEXT: retq
; SSE-NEXT: LBB2_1: ## %cond.load
-; SSE-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE-NEXT: movlps (%rdi), %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
; SSE-NEXT: addq $8, %rdi
; SSE-NEXT: testb $2, %al
; SSE-NEXT: je LBB2_4
; SSE-NEXT: LBB2_3: ## %cond.load1
-; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE-NEXT: movhps (%rdi), %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT: addq $8, %rdi
; SSE-NEXT: testb $4, %al
; SSE-NEXT: je LBB2_6
; SSE-NEXT: LBB2_5: ## %cond.load5
-; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; SSE-NEXT: movlps (%rdi), %xmm1 ## xmm1 = mem[0,1],xmm1[2,3]
; SSE-NEXT: addq $8, %rdi
; SSE-NEXT: testb $8, %al
; SSE-NEXT: je LBB2_8
; SSE-NEXT: LBB2_7: ## %cond.load9
-; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; SSE-NEXT: movhps (%rdi), %xmm1 ## xmm1 = xmm1[0,1],mem[0,1]
; SSE-NEXT: addq $8, %rdi
; SSE-NEXT: testb $16, %al
; SSE-NEXT: je LBB2_10
; SSE-NEXT: LBB2_9: ## %cond.load13
-; SSE-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
+; SSE-NEXT: movlps (%rdi), %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
; SSE-NEXT: addq $8, %rdi
; SSE-NEXT: testb $32, %al
; SSE-NEXT: je LBB2_12
; SSE-NEXT: LBB2_11: ## %cond.load17
-; SSE-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
+; SSE-NEXT: movhps (%rdi), %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
; SSE-NEXT: addq $8, %rdi
; SSE-NEXT: testb $64, %al
; SSE-NEXT: je LBB2_14
; SSE-NEXT: LBB2_13: ## %cond.load21
-; SSE-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
+; SSE-NEXT: movlps (%rdi), %xmm3 ## xmm3 = mem[0,1],xmm3[2,3]
; SSE-NEXT: addq $8, %rdi
; SSE-NEXT: testb $-128, %al
; SSE-NEXT: je LBB2_16
; SSE-NEXT: LBB2_15: ## %cond.load25
-; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
+; SSE-NEXT: movhps (%rdi), %xmm3 ## xmm3 = xmm3[0,1],mem[0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: expandload_v8f64_v8i1:
@@ -393,53 +393,53 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8
; AVX1-NEXT: LBB2_16: ## %else26
; AVX1-NEXT: retq
; AVX1-NEXT: LBB2_1: ## %cond.load
-; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vmovsd (%rdi), %xmm2 ## xmm2 = mem[0],zero
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je LBB2_4
; AVX1-NEXT: LBB2_3: ## %cond.load1
-; AVX1-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1]
+; AVX1-NEXT: vmovhps (%rdi), %xmm0, %xmm2 ## xmm2 = xmm0[0,1],mem[0,1]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je LBB2_6
; AVX1-NEXT: LBB2_5: ## %cond.load5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
+; AVX1-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je LBB2_8
; AVX1-NEXT: LBB2_7: ## %cond.load9
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
+; AVX1-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je LBB2_10
; AVX1-NEXT: LBB2_9: ## %cond.load13
-; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vmovsd (%rdi), %xmm2 ## xmm2 = mem[0],zero
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je LBB2_12
; AVX1-NEXT: LBB2_11: ## %cond.load17
-; AVX1-NEXT: vmovhps {{.*#+}} xmm2 = xmm1[0,1],mem[0,1]
+; AVX1-NEXT: vmovhps (%rdi), %xmm1, %xmm2 ## xmm2 = xmm1[0,1],mem[0,1]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je LBB2_14
; AVX1-NEXT: LBB2_13: ## %cond.load21
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
+; AVX1-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je LBB2_16
; AVX1-NEXT: LBB2_15: ## %cond.load25
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
+; AVX1-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
@@ -474,53 +474,53 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8
; AVX2-NEXT: LBB2_16: ## %else26
; AVX2-NEXT: retq
; AVX2-NEXT: LBB2_1: ## %cond.load
-; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vmovq (%rdi), %xmm2 ## xmm2 = mem[0],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je LBB2_4
; AVX2-NEXT: LBB2_3: ## %cond.load1
-; AVX2-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1]
+; AVX2-NEXT: vmovhps (%rdi), %xmm0, %xmm2 ## xmm2 = xmm0[0,1],mem[0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je LBB2_6
; AVX2-NEXT: LBB2_5: ## %cond.load5
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
+; AVX2-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je LBB2_8
; AVX2-NEXT: LBB2_7: ## %cond.load9
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
+; AVX2-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je LBB2_10
; AVX2-NEXT: LBB2_9: ## %cond.load13
-; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vmovq (%rdi), %xmm2 ## xmm2 = mem[0],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je LBB2_12
; AVX2-NEXT: LBB2_11: ## %cond.load17
-; AVX2-NEXT: vmovhps {{.*#+}} xmm2 = xmm1[0,1],mem[0,1]
+; AVX2-NEXT: vmovhps (%rdi), %xmm1, %xmm2 ## xmm2 = xmm1[0,1],mem[0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je LBB2_14
; AVX2-NEXT: LBB2_13: ## %cond.load21
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
+; AVX2-NEXT: vmovlps (%rdi), %xmm2, %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je LBB2_16
; AVX2-NEXT: LBB2_15: ## %cond.load25
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
+; AVX2-NEXT: vmovhps (%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
@@ -614,7 +614,7 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; SSE-NEXT: testl $32768, %ecx ## imm = 0x8000
; SSE-NEXT: je LBB3_32
; SSE-NEXT: LBB3_31: ## %cond.load57
-; SSE-NEXT: movhps {{.*#+}} xmm7 = xmm7[0,1],mem[0,1]
+; SSE-NEXT: movhps (%rsi), %xmm7 ## xmm7 = xmm7[0,1],mem[0,1]
; SSE-NEXT: LBB3_32: ## %else58
; SSE-NEXT: movaps %xmm0, (%rax)
; SSE-NEXT: movaps %xmm1, 16(%rax)
@@ -626,77 +626,77 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; SSE-NEXT: movaps %xmm7, 112(%rax)
; SSE-NEXT: retq
; SSE-NEXT: LBB3_1: ## %cond.load
-; SSE-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE-NEXT: movlps (%rsi), %xmm0 ## xmm0 = mem[0,1],xmm0[2,3]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testb $2, %cl
; SSE-NEXT: je LBB3_4
; SSE-NEXT: LBB3_3: ## %cond.load1
-; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE-NEXT: movhps (%rsi), %xmm0 ## xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testb $4, %cl
; SSE-NEXT: je LBB3_6
; SSE-NEXT: LBB3_5: ## %cond.load5
-; SSE-NEXT: movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; SSE-NEXT: movlps (%rsi), %xmm1 ## xmm1 = mem[0,1],xmm1[2,3]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testb $8, %cl
; SSE-NEXT: je LBB3_8
; SSE-NEXT: LBB3_7: ## %cond.load9
-; SSE-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; SSE-NEXT: movhps (%rsi), %xmm1 ## xmm1 = xmm1[0,1],mem[0,1]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testb $16, %cl
; SSE-NEXT: je LBB3_10
; SSE-NEXT: LBB3_9: ## %cond.load13
-; SSE-NEXT: movlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
+; SSE-NEXT: movlps (%rsi), %xmm2 ## xmm2 = mem[0,1],xmm2[2,3]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testb $32, %cl
; SSE-NEXT: je LBB3_12
; SSE-NEXT: LBB3_11: ## %cond.load17
-; SSE-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
+; SSE-NEXT: movhps (%rsi), %xmm2 ## xmm2 = xmm2[0,1],mem[0,1]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testb $64, %cl
; SSE-NEXT: je LBB3_14
; SSE-NEXT: LBB3_13: ## %cond.load21
-; SSE-NEXT: movlps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
+; SSE-NEXT: movlps (%rsi), %xmm3 ## xmm3 = mem[0,1],xmm3[2,3]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testb $-128, %cl
; SSE-NEXT: je LBB3_16
; SSE-NEXT: LBB3_15: ## %cond.load25
-; SSE-NEXT: movhps {{.*#+}} xmm3 = xmm3[0,1],mem[0,1]
+; SSE-NEXT: movhps (%rsi), %xmm3 ## xmm3 = xmm3[0,1],mem[0,1]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testl $256, %ecx ## imm = 0x100
; SSE-NEXT: je LBB3_18
; SSE-NEXT: LBB3_17: ## %cond.load29
-; SSE-NEXT: movlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
+; SSE-NEXT: movlps (%rsi), %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testl $512, %ecx ## imm = 0x200
; SSE-NEXT: je LBB3_20
; SSE-NEXT: LBB3_19: ## %cond.load33
-; SSE-NEXT: movhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
+; SSE-NEXT: movhps (%rsi), %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testl $1024, %ecx ## imm = 0x400
; SSE-NEXT: je LBB3_22
; SSE-NEXT: LBB3_21: ## %cond.load37
-; SSE-NEXT: movlps {{.*#+}} xmm5 = mem[0,1],xmm5[2,3]
+; SSE-NEXT: movlps (%rsi), %xmm5 ## xmm5 = mem[0,1],xmm5[2,3]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testl $2048, %ecx ## imm = 0x800
; SSE-NEXT: je LBB3_24
; SSE-NEXT: LBB3_23: ## %cond.load41
-; SSE-NEXT: movhps {{.*#+}} xmm5 = xmm5[0,1],mem[0,1]
+; SSE-NEXT: movhps (%rsi), %xmm5 ## xmm5 = xmm5[0,1],mem[0,1]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testl $4096, %ecx ## imm = 0x1000
; SSE-NEXT: je LBB3_26
; SSE-NEXT: LBB3_25: ## %cond.load45
-; SSE-NEXT: movlps {{.*#+}} xmm6 = mem[0,1],xmm6[2,3]
+; SSE-NEXT: movlps (%rsi), %xmm6 ## xmm6 = mem[0,1],xmm6[2,3]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testl $8192, %ecx ## imm = 0x2000
; SSE-NEXT: je LBB3_28
; SSE-NEXT: LBB3_27: ## %cond.load49
-; SSE-NEXT: movhps {{.*#+}} xmm6 = xmm6[0,1],mem[0,1]
+; SSE-NEXT: movhps (%rsi), %xmm6 ## xmm6 = xmm6[0,1],mem[0,1]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testl $16384, %ecx ## imm = 0x4000
; SSE-NEXT: je LBB3_30
; SSE-NEXT: LBB3_29: ## %cond.load53
-; SSE-NEXT: movlps {{.*#+}} xmm7 = mem[0,1],xmm7[2,3]
+; SSE-NEXT: movlps (%rsi), %xmm7 ## xmm7 = mem[0,1],xmm7[2,3]
; SSE-NEXT: addq $8, %rsi
; SSE-NEXT: testl $32768, %ecx ## imm = 0x8000
; SSE-NEXT: jne LBB3_31
@@ -765,105 +765,105 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; AVX1-NEXT: LBB3_32: ## %else58
; AVX1-NEXT: retq
; AVX1-NEXT: LBB3_1: ## %cond.load
-; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX1-NEXT: vmovsd (%rdi), %xmm4 ## xmm4 = mem[0],zero
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je LBB3_4
; AVX1-NEXT: LBB3_3: ## %cond.load1
-; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm0[0,1],mem[0,1]
+; AVX1-NEXT: vmovhps (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0,1],mem[0,1]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je LBB3_6
; AVX1-NEXT: LBB3_5: ## %cond.load5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
+; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je LBB3_8
; AVX1-NEXT: LBB3_7: ## %cond.load9
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
+; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je LBB3_10
; AVX1-NEXT: LBB3_9: ## %cond.load13
-; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX1-NEXT: vmovsd (%rdi), %xmm4 ## xmm4 = mem[0],zero
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je LBB3_12
; AVX1-NEXT: LBB3_11: ## %cond.load17
-; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm1[0,1],mem[0,1]
+; AVX1-NEXT: vmovhps (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0,1],mem[0,1]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je LBB3_14
; AVX1-NEXT: LBB3_13: ## %cond.load21
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vmovlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
+; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je LBB3_16
; AVX1-NEXT: LBB3_15: ## %cond.load25
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
+; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testl $256, %eax ## imm = 0x100
; AVX1-NEXT: je LBB3_18
; AVX1-NEXT: LBB3_17: ## %cond.load29
-; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX1-NEXT: vmovsd (%rdi), %xmm4 ## xmm4 = mem[0],zero
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testl $512, %eax ## imm = 0x200
; AVX1-NEXT: je LBB3_20
; AVX1-NEXT: LBB3_19: ## %cond.load33
-; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm2[0,1],mem[0,1]
+; AVX1-NEXT: vmovhps (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0,1],mem[0,1]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
; AVX1-NEXT: je LBB3_22
; AVX1-NEXT: LBB3_21: ## %cond.load37
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vmovlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
+; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
; AVX1-NEXT: je LBB3_24
; AVX1-NEXT: LBB3_23: ## %cond.load41
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
+; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX1-NEXT: je LBB3_26
; AVX1-NEXT: LBB3_25: ## %cond.load45
-; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX1-NEXT: vmovsd (%rdi), %xmm4 ## xmm4 = mem[0],zero
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX1-NEXT: je LBB3_28
; AVX1-NEXT: LBB3_27: ## %cond.load49
-; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm3[0,1],mem[0,1]
+; AVX1-NEXT: vmovhps (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0,1],mem[0,1]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX1-NEXT: je LBB3_30
; AVX1-NEXT: LBB3_29: ## %cond.load53
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vmovlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
+; AVX1-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: addq $8, %rdi
; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX1-NEXT: je LBB3_32
; AVX1-NEXT: LBB3_31: ## %cond.load57
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
+; AVX1-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: retq
;
@@ -927,105 +927,105 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; AVX2-NEXT: LBB3_32: ## %else58
; AVX2-NEXT: retq
; AVX2-NEXT: LBB3_1: ## %cond.load
-; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX2-NEXT: vmovq (%rdi), %xmm4 ## xmm4 = mem[0],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je LBB3_4
; AVX2-NEXT: LBB3_3: ## %cond.load1
-; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm0[0,1],mem[0,1]
+; AVX2-NEXT: vmovhps (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0,1],mem[0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je LBB3_6
; AVX2-NEXT: LBB3_5: ## %cond.load5
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vmovlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
+; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je LBB3_8
; AVX2-NEXT: LBB3_7: ## %cond.load9
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
+; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je LBB3_10
; AVX2-NEXT: LBB3_9: ## %cond.load13
-; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX2-NEXT: vmovq (%rdi), %xmm4 ## xmm4 = mem[0],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $32, %al
; AVX2-NEXT: je LBB3_12
; AVX2-NEXT: LBB3_11: ## %cond.load17
-; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm1[0,1],mem[0,1]
+; AVX2-NEXT: vmovhps (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0,1],mem[0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je LBB3_14
; AVX2-NEXT: LBB3_13: ## %cond.load21
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vmovlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
+; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je LBB3_16
; AVX2-NEXT: LBB3_15: ## %cond.load25
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
+; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testl $256, %eax ## imm = 0x100
; AVX2-NEXT: je LBB3_18
; AVX2-NEXT: LBB3_17: ## %cond.load29
-; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX2-NEXT: vmovq (%rdi), %xmm4 ## xmm4 = mem[0],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testl $512, %eax ## imm = 0x200
; AVX2-NEXT: je LBB3_20
; AVX2-NEXT: LBB3_19: ## %cond.load33
-; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm2[0,1],mem[0,1]
+; AVX2-NEXT: vmovhps (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0,1],mem[0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
; AVX2-NEXT: je LBB3_22
; AVX2-NEXT: LBB3_21: ## %cond.load37
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vmovlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
+; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
; AVX2-NEXT: je LBB3_24
; AVX2-NEXT: LBB3_23: ## %cond.load41
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
+; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX2-NEXT: je LBB3_26
; AVX2-NEXT: LBB3_25: ## %cond.load45
-; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX2-NEXT: vmovq (%rdi), %xmm4 ## xmm4 = mem[0],zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
; AVX2-NEXT: je LBB3_28
; AVX2-NEXT: LBB3_27: ## %cond.load49
-; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm3[0,1],mem[0,1]
+; AVX2-NEXT: vmovhps (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0,1],mem[0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX2-NEXT: je LBB3_30
; AVX2-NEXT: LBB3_29: ## %cond.load53
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vmovlps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
+; AVX2-NEXT: vmovlps (%rdi), %xmm4, %xmm4 ## xmm4 = mem[0,1],xmm4[2,3]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX2-NEXT: je LBB3_32
; AVX2-NEXT: LBB3_31: ## %cond.load57
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vmovhps {{.*#+}} xmm4 = xmm4[0,1],mem[0,1]
+; AVX2-NEXT: vmovhps (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0,1]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX2-NEXT: retq
;
@@ -1129,13 +1129,13 @@ define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x
; SSE2-NEXT: LBB4_4: ## %else2
; SSE2-NEXT: retq
; SSE2-NEXT: LBB4_1: ## %cond.load
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: addq $4, %rdi
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB4_4
; SSE2-NEXT: LBB4_3: ## %cond.load1
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
@@ -1155,13 +1155,13 @@ define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x
; SSE42-NEXT: LBB4_4: ## %else2
; SSE42-NEXT: retq
; SSE42-NEXT: LBB4_1: ## %cond.load
-; SSE42-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE42-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE42-NEXT: addq $4, %rdi
; SSE42-NEXT: testb $2, %al
; SSE42-NEXT: je LBB4_4
; SSE42-NEXT: LBB4_3: ## %cond.load1
-; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE42-NEXT: insertps $16, (%rdi), %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: expandload_v2f32_v2i1:
@@ -1178,13 +1178,13 @@ define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x
; AVX1OR2-NEXT: LBB4_4: ## %else2
; AVX1OR2-NEXT: retq
; AVX1OR2-NEXT: LBB4_1: ## %cond.load
-; AVX1OR2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1OR2-NEXT: vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX1OR2-NEXT: addq $4, %rdi
; AVX1OR2-NEXT: testb $2, %al
; AVX1OR2-NEXT: je LBB4_4
; AVX1OR2-NEXT: LBB4_3: ## %cond.load1
-; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX1OR2-NEXT: vinsertps $16, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: expandload_v2f32_v2i1:
@@ -1222,10 +1222,10 @@ define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x
define <4 x float> @expandload_v4f32_const(float* %base, <4 x float> %src0) {
; SSE2-LABEL: expandload_v4f32_const:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss 4(%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[0,0]
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss 8(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,2]
; SSE2-NEXT: movaps %xmm1, %xmm0
@@ -1233,18 +1233,18 @@ define <4 x float> @expandload_v4f32_const(float* %base, <4 x float> %src0) {
;
; SSE42-LABEL: expandload_v4f32_const:
; SSE42: ## %bb.0:
-; SSE42-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE42-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE42-NEXT: insertps $16, 4(%rdi), %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE42-NEXT: insertps $32, 8(%rdi), %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: expandload_v4f32_const:
; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1OR2-NEXT: vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1OR2-NEXT: vinsertps $16, 4(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX1OR2-NEXT: vinsertps $32, 8(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: expandload_v4f32_const:
@@ -1279,16 +1279,16 @@ define <16 x float> @expandload_v16f32_const(float* %base, <16 x float> %src0) {
; SSE2: ## %bb.0:
; SSE2-NEXT: movups (%rdi), %xmm0
; SSE2-NEXT: movups 16(%rdi), %xmm1
-; SSE2-NEXT: movss {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss 32(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss 36(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm5[0,0]
-; SSE2-NEXT: movss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss 40(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[0,2]
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss 44(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss 48(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm2[0,0]
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss 52(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm2[0,2]
; SSE2-NEXT: movaps %xmm4, %xmm2
@@ -1299,32 +1299,32 @@ define <16 x float> @expandload_v16f32_const(float* %base, <16 x float> %src0) {
; SSE42: ## %bb.0:
; SSE42-NEXT: movups (%rdi), %xmm0
; SSE42-NEXT: movups 16(%rdi), %xmm1
-; SSE42-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE42-NEXT: movss 32(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; SSE42-NEXT: blendps {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
-; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; SSE42-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE42-NEXT: insertps $16, 36(%rdi), %xmm2 ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; SSE42-NEXT: insertps $32, 40(%rdi), %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; SSE42-NEXT: movss 44(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; SSE42-NEXT: blendps {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
-; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
-; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; SSE42-NEXT: insertps $16, 48(%rdi), %xmm3 ## xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; SSE42-NEXT: insertps $32, 52(%rdi), %xmm3 ## xmm3 = xmm3[0,1],mem[0],xmm3[3]
; SSE42-NEXT: retq
;
; AVX1OR2-LABEL: expandload_v16f32_const:
; AVX1OR2: ## %bb.0:
; AVX1OR2-NEXT: vmovups (%rdi), %xmm0
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; AVX1OR2-NEXT: vmovsd 16(%rdi), %xmm2 ## xmm2 = mem[0],zero
+; AVX1OR2-NEXT: vinsertps $32, 24(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1OR2-NEXT: vinsertps $48, 28(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1,2],mem[0]
; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1OR2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX1OR2-NEXT: vmovss 32(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero
; AVX1OR2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3]
-; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX1OR2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX1OR2-NEXT: vinsertps $16, 36(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1OR2-NEXT: vinsertps $32, 40(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1OR2-NEXT: vmovss 44(%rdi), %xmm3 ## xmm3 = mem[0],zero,zero,zero
; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1OR2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
-; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; AVX1OR2-NEXT: vinsertps $16, 48(%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; AVX1OR2-NEXT: vinsertps $32, 52(%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1OR2-NEXT: retq
;
@@ -1355,8 +1355,8 @@ define <16 x float> @expandload_v16f32_const(float* %base, <16 x float> %src0) {
define <16 x float> @expandload_v16f32_const_undef(float* %base) {
; SSE2-LABEL: expandload_v16f32_const_undef:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE2-NEXT: movss 40(%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movsd 32(%rdi), %xmm2 ## xmm2 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE2-NEXT: movups (%rdi), %xmm0
; SSE2-NEXT: movups 16(%rdi), %xmm1
@@ -1365,8 +1365,8 @@ define <16 x float> @expandload_v16f32_const_undef(float* %base) {
;
; SSE42-LABEL: expandload_v16f32_const_undef:
; SSE42: ## %bb.0:
-; SSE42-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; SSE42-NEXT: movsd 32(%rdi), %xmm2 ## xmm2 = mem[0],zero
+; SSE42-NEXT: insertps $32, 40(%rdi), %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
; SSE42-NEXT: movups (%rdi), %xmm0
; SSE42-NEXT: movups 16(%rdi), %xmm1
; SSE42-NEXT: movups 44(%rdi), %xmm3
@@ -1374,8 +1374,8 @@ define <16 x float> @expandload_v16f32_const_undef(float* %base) {
;
; AVX1OR2-LABEL: expandload_v16f32_const_undef:
; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1OR2-NEXT: vmovsd 32(%rdi), %xmm0 ## xmm0 = mem[0],zero
+; AVX1OR2-NEXT: vinsertps $32, 40(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX1OR2-NEXT: vinsertf128 $1, 44(%rdi), %ymm0, %ymm1
; AVX1OR2-NEXT: vmovups (%rdi), %ymm0
; AVX1OR2-NEXT: retq
@@ -1531,7 +1531,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: testl $-2147483648, %ecx ## imm = 0x80000000
; SSE2-NEXT: je LBB8_64
; SSE2-NEXT: LBB8_63: ## %cond.load121
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm7[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,0]
; SSE2-NEXT: LBB8_64: ## %else122
@@ -1545,13 +1545,13 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: movaps %xmm7, 112(%rax)
; SSE2-NEXT: retq
; SSE2-NEXT: LBB8_1: ## %cond.load
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testb $2, %cl
; SSE2-NEXT: je LBB8_4
; SSE2-NEXT: LBB8_3: ## %cond.load1
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm0[2,3]
; SSE2-NEXT: addq $4, %rsi
@@ -1559,27 +1559,27 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: testb $4, %cl
; SSE2-NEXT: je LBB8_6
; SSE2-NEXT: LBB8_5: ## %cond.load5
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[0,2]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testb $8, %cl
; SSE2-NEXT: je LBB8_8
; SSE2-NEXT: LBB8_7: ## %cond.load9
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testb $16, %cl
; SSE2-NEXT: je LBB8_10
; SSE2-NEXT: LBB8_9: ## %cond.load13
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm8[0],xmm1[1,2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testb $32, %cl
; SSE2-NEXT: je LBB8_12
; SSE2-NEXT: LBB8_11: ## %cond.load17
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm1[2,3]
; SSE2-NEXT: addq $4, %rsi
@@ -1587,27 +1587,27 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: testb $64, %cl
; SSE2-NEXT: je LBB8_14
; SSE2-NEXT: LBB8_13: ## %cond.load21
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm1[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,2]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testb $-128, %cl
; SSE2-NEXT: je LBB8_16
; SSE2-NEXT: LBB8_15: ## %cond.load25
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $256, %ecx ## imm = 0x100
; SSE2-NEXT: je LBB8_18
; SSE2-NEXT: LBB8_17: ## %cond.load29
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm8[0],xmm2[1,2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $512, %ecx ## imm = 0x200
; SSE2-NEXT: je LBB8_20
; SSE2-NEXT: LBB8_19: ## %cond.load33
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm2[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[2,3]
; SSE2-NEXT: addq $4, %rsi
@@ -1615,27 +1615,27 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: testl $1024, %ecx ## imm = 0x400
; SSE2-NEXT: je LBB8_22
; SSE2-NEXT: LBB8_21: ## %cond.load37
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm2[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[0,2]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $2048, %ecx ## imm = 0x800
; SSE2-NEXT: je LBB8_24
; SSE2-NEXT: LBB8_23: ## %cond.load41
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm2[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $4096, %ecx ## imm = 0x1000
; SSE2-NEXT: je LBB8_26
; SSE2-NEXT: LBB8_25: ## %cond.load45
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm8[0],xmm3[1,2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $8192, %ecx ## imm = 0x2000
; SSE2-NEXT: je LBB8_28
; SSE2-NEXT: LBB8_27: ## %cond.load49
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm3[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm3[2,3]
; SSE2-NEXT: addq $4, %rsi
@@ -1643,27 +1643,27 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: testl $16384, %ecx ## imm = 0x4000
; SSE2-NEXT: je LBB8_30
; SSE2-NEXT: LBB8_29: ## %cond.load53
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm3[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm8[0,2]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $32768, %ecx ## imm = 0x8000
; SSE2-NEXT: je LBB8_32
; SSE2-NEXT: LBB8_31: ## %cond.load57
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm3[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $65536, %ecx ## imm = 0x10000
; SSE2-NEXT: je LBB8_34
; SSE2-NEXT: LBB8_33: ## %cond.load61
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm8[0],xmm4[1,2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $131072, %ecx ## imm = 0x20000
; SSE2-NEXT: je LBB8_36
; SSE2-NEXT: LBB8_35: ## %cond.load65
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm4[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[2,3]
; SSE2-NEXT: addq $4, %rsi
@@ -1671,27 +1671,27 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: testl $262144, %ecx ## imm = 0x40000
; SSE2-NEXT: je LBB8_38
; SSE2-NEXT: LBB8_37: ## %cond.load69
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm4[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[0,2]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $524288, %ecx ## imm = 0x80000
; SSE2-NEXT: je LBB8_40
; SSE2-NEXT: LBB8_39: ## %cond.load73
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm4[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $1048576, %ecx ## imm = 0x100000
; SSE2-NEXT: je LBB8_42
; SSE2-NEXT: LBB8_41: ## %cond.load77
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm8[0],xmm5[1,2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $2097152, %ecx ## imm = 0x200000
; SSE2-NEXT: je LBB8_44
; SSE2-NEXT: LBB8_43: ## %cond.load81
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm5[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm5[2,3]
; SSE2-NEXT: addq $4, %rsi
@@ -1699,27 +1699,27 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: testl $4194304, %ecx ## imm = 0x400000
; SSE2-NEXT: je LBB8_46
; SSE2-NEXT: LBB8_45: ## %cond.load85
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm5[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[0,2]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $8388608, %ecx ## imm = 0x800000
; SSE2-NEXT: je LBB8_48
; SSE2-NEXT: LBB8_47: ## %cond.load89
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm5[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $16777216, %ecx ## imm = 0x1000000
; SSE2-NEXT: je LBB8_50
; SSE2-NEXT: LBB8_49: ## %cond.load93
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm6 = xmm8[0],xmm6[1,2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $33554432, %ecx ## imm = 0x2000000
; SSE2-NEXT: je LBB8_52
; SSE2-NEXT: LBB8_51: ## %cond.load97
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm6[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm6[2,3]
; SSE2-NEXT: addq $4, %rsi
@@ -1727,27 +1727,27 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: testl $67108864, %ecx ## imm = 0x4000000
; SSE2-NEXT: je LBB8_54
; SSE2-NEXT: LBB8_53: ## %cond.load101
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm6[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[0,2]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $134217728, %ecx ## imm = 0x8000000
; SSE2-NEXT: je LBB8_56
; SSE2-NEXT: LBB8_55: ## %cond.load105
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm6[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,0]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $268435456, %ecx ## imm = 0x10000000
; SSE2-NEXT: je LBB8_58
; SSE2-NEXT: LBB8_57: ## %cond.load109
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm7 = xmm8[0],xmm7[1,2,3]
; SSE2-NEXT: addq $4, %rsi
; SSE2-NEXT: testl $536870912, %ecx ## imm = 0x20000000
; SSE2-NEXT: je LBB8_60
; SSE2-NEXT: LBB8_59: ## %cond.load113
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm7[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[2,3]
; SSE2-NEXT: addq $4, %rsi
@@ -1755,7 +1755,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE2-NEXT: testl $1073741824, %ecx ## imm = 0x40000000
; SSE2-NEXT: je LBB8_62
; SSE2-NEXT: LBB8_61: ## %cond.load117
-; SSE2-NEXT: movss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,0],xmm7[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2]
; SSE2-NEXT: addq $4, %rsi
@@ -1888,7 +1888,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE42-NEXT: testl $-2147483648, %ecx ## imm = 0x80000000
; SSE42-NEXT: je LBB8_64
; SSE42-NEXT: LBB8_63: ## %cond.load121
-; SSE42-NEXT: insertps {{.*#+}} xmm7 = xmm7[0,1,2],mem[0]
+; SSE42-NEXT: insertps $48, (%rsi), %xmm7 ## xmm7 = xmm7[0,1,2],mem[0]
; SSE42-NEXT: LBB8_64: ## %else122
; SSE42-NEXT: movaps %xmm0, (%rax)
; SSE42-NEXT: movaps %xmm1, 16(%rax)
@@ -1900,165 +1900,165 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; SSE42-NEXT: movaps %xmm7, 112(%rax)
; SSE42-NEXT: retq
; SSE42-NEXT: LBB8_1: ## %cond.load
-; SSE42-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE42-NEXT: movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3,4,5,6,7]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testb $2, %cl
; SSE42-NEXT: je LBB8_4
; SSE42-NEXT: LBB8_3: ## %cond.load1
-; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE42-NEXT: insertps $16, (%rsi), %xmm0 ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testb $4, %cl
; SSE42-NEXT: je LBB8_6
; SSE42-NEXT: LBB8_5: ## %cond.load5
-; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE42-NEXT: insertps $32, (%rsi), %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testb $8, %cl
; SSE42-NEXT: je LBB8_8
; SSE42-NEXT: LBB8_7: ## %cond.load9
-; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE42-NEXT: insertps $48, (%rsi), %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testb $16, %cl
; SSE42-NEXT: je LBB8_10
; SSE42-NEXT: LBB8_9: ## %cond.load13
-; SSE42-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE42-NEXT: movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3,4,5,6,7]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testb $32, %cl
; SSE42-NEXT: je LBB8_12
; SSE42-NEXT: LBB8_11: ## %cond.load17
-; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; SSE42-NEXT: insertps $16, (%rsi), %xmm1 ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testb $64, %cl
; SSE42-NEXT: je LBB8_14
; SSE42-NEXT: LBB8_13: ## %cond.load21
-; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; SSE42-NEXT: insertps $32, (%rsi), %xmm1 ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testb $-128, %cl
; SSE42-NEXT: je LBB8_16
; SSE42-NEXT: LBB8_15: ## %cond.load25
-; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; SSE42-NEXT: insertps $48, (%rsi), %xmm1 ## xmm1 = xmm1[0,1,2],mem[0]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $256, %ecx ## imm = 0x100
; SSE42-NEXT: je LBB8_18
; SSE42-NEXT: LBB8_17: ## %cond.load29
-; SSE42-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE42-NEXT: movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3,4,5,6,7]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $512, %ecx ## imm = 0x200
; SSE42-NEXT: je LBB8_20
; SSE42-NEXT: LBB8_19: ## %cond.load33
-; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; SSE42-NEXT: insertps $16, (%rsi), %xmm2 ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $1024, %ecx ## imm = 0x400
; SSE42-NEXT: je LBB8_22
; SSE42-NEXT: LBB8_21: ## %cond.load37
-; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; SSE42-NEXT: insertps $32, (%rsi), %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $2048, %ecx ## imm = 0x800
; SSE42-NEXT: je LBB8_24
; SSE42-NEXT: LBB8_23: ## %cond.load41
-; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
+; SSE42-NEXT: insertps $48, (%rsi), %xmm2 ## xmm2 = xmm2[0,1,2],mem[0]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $4096, %ecx ## imm = 0x1000
; SSE42-NEXT: je LBB8_26
; SSE42-NEXT: LBB8_25: ## %cond.load45
-; SSE42-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE42-NEXT: movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $8192, %ecx ## imm = 0x2000
; SSE42-NEXT: je LBB8_28
; SSE42-NEXT: LBB8_27: ## %cond.load49
-; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; SSE42-NEXT: insertps $16, (%rsi), %xmm3 ## xmm3 = xmm3[0],mem[0],xmm3[2,3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $16384, %ecx ## imm = 0x4000
; SSE42-NEXT: je LBB8_30
; SSE42-NEXT: LBB8_29: ## %cond.load53
-; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; SSE42-NEXT: insertps $32, (%rsi), %xmm3 ## xmm3 = xmm3[0,1],mem[0],xmm3[3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $32768, %ecx ## imm = 0x8000
; SSE42-NEXT: je LBB8_32
; SSE42-NEXT: LBB8_31: ## %cond.load57
-; SSE42-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
+; SSE42-NEXT: insertps $48, (%rsi), %xmm3 ## xmm3 = xmm3[0,1,2],mem[0]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $65536, %ecx ## imm = 0x10000
; SSE42-NEXT: je LBB8_34
; SSE42-NEXT: LBB8_33: ## %cond.load61
-; SSE42-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE42-NEXT: movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3,4,5,6,7]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $131072, %ecx ## imm = 0x20000
; SSE42-NEXT: je LBB8_36
; SSE42-NEXT: LBB8_35: ## %cond.load65
-; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; SSE42-NEXT: insertps $16, (%rsi), %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $262144, %ecx ## imm = 0x40000
; SSE42-NEXT: je LBB8_38
; SSE42-NEXT: LBB8_37: ## %cond.load69
-; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; SSE42-NEXT: insertps $32, (%rsi), %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $524288, %ecx ## imm = 0x80000
; SSE42-NEXT: je LBB8_40
; SSE42-NEXT: LBB8_39: ## %cond.load73
-; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0]
+; SSE42-NEXT: insertps $48, (%rsi), %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $1048576, %ecx ## imm = 0x100000
; SSE42-NEXT: je LBB8_42
; SSE42-NEXT: LBB8_41: ## %cond.load77
-; SSE42-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE42-NEXT: movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3,4,5,6,7]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $2097152, %ecx ## imm = 0x200000
; SSE42-NEXT: je LBB8_44
; SSE42-NEXT: LBB8_43: ## %cond.load81
-; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0],mem[0],xmm5[2,3]
+; SSE42-NEXT: insertps $16, (%rsi), %xmm5 ## xmm5 = xmm5[0],mem[0],xmm5[2,3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $4194304, %ecx ## imm = 0x400000
; SSE42-NEXT: je LBB8_46
; SSE42-NEXT: LBB8_45: ## %cond.load85
-; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1],mem[0],xmm5[3]
+; SSE42-NEXT: insertps $32, (%rsi), %xmm5 ## xmm5 = xmm5[0,1],mem[0],xmm5[3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $8388608, %ecx ## imm = 0x800000
; SSE42-NEXT: je LBB8_48
; SSE42-NEXT: LBB8_47: ## %cond.load89
-; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],mem[0]
+; SSE42-NEXT: insertps $48, (%rsi), %xmm5 ## xmm5 = xmm5[0,1,2],mem[0]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $16777216, %ecx ## imm = 0x1000000
; SSE42-NEXT: je LBB8_50
; SSE42-NEXT: LBB8_49: ## %cond.load93
-; SSE42-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE42-NEXT: movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3,4,5,6,7]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $33554432, %ecx ## imm = 0x2000000
; SSE42-NEXT: je LBB8_52
; SSE42-NEXT: LBB8_51: ## %cond.load97
-; SSE42-NEXT: insertps {{.*#+}} xmm6 = xmm6[0],mem[0],xmm6[2,3]
+; SSE42-NEXT: insertps $16, (%rsi), %xmm6 ## xmm6 = xmm6[0],mem[0],xmm6[2,3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $67108864, %ecx ## imm = 0x4000000
; SSE42-NEXT: je LBB8_54
; SSE42-NEXT: LBB8_53: ## %cond.load101
-; SSE42-NEXT: insertps {{.*#+}} xmm6 = xmm6[0,1],mem[0],xmm6[3]
+; SSE42-NEXT: insertps $32, (%rsi), %xmm6 ## xmm6 = xmm6[0,1],mem[0],xmm6[3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $134217728, %ecx ## imm = 0x8000000
; SSE42-NEXT: je LBB8_56
; SSE42-NEXT: LBB8_55: ## %cond.load105
-; SSE42-NEXT: insertps {{.*#+}} xmm6 = xmm6[0,1,2],mem[0]
+; SSE42-NEXT: insertps $48, (%rsi), %xmm6 ## xmm6 = xmm6[0,1,2],mem[0]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $268435456, %ecx ## imm = 0x10000000
; SSE42-NEXT: je LBB8_58
; SSE42-NEXT: LBB8_57: ## %cond.load109
-; SSE42-NEXT: movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE42-NEXT: movd (%rsi), %xmm8 ## xmm8 = mem[0],zero,zero,zero
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3,4,5,6,7]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $536870912, %ecx ## imm = 0x20000000
; SSE42-NEXT: je LBB8_60
; SSE42-NEXT: LBB8_59: ## %cond.load113
-; SSE42-NEXT: insertps {{.*#+}} xmm7 = xmm7[0],mem[0],xmm7[2,3]
+; SSE42-NEXT: insertps $16, (%rsi), %xmm7 ## xmm7 = xmm7[0],mem[0],xmm7[2,3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $1073741824, %ecx ## imm = 0x40000000
; SSE42-NEXT: je LBB8_62
; SSE42-NEXT: LBB8_61: ## %cond.load117
-; SSE42-NEXT: insertps {{.*#+}} xmm7 = xmm7[0,1],mem[0],xmm7[3]
+; SSE42-NEXT: insertps $32, (%rsi), %xmm7 ## xmm7 = xmm7[0,1],mem[0],xmm7[3]
; SSE42-NEXT: addq $4, %rsi
; SSE42-NEXT: testl $-2147483648, %ecx ## imm = 0x80000000
; SSE42-NEXT: jne LBB8_63
@@ -2187,31 +2187,31 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX1-NEXT: LBB8_64: ## %else122
; AVX1-NEXT: retq
; AVX1-NEXT: LBB8_1: ## %cond.load
-; AVX1-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je LBB8_4
; AVX1-NEXT: LBB8_3: ## %cond.load1
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm0[0],mem[0],xmm0[2,3]
+; AVX1-NEXT: vinsertps $16, (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je LBB8_6
; AVX1-NEXT: LBB8_5: ## %cond.load5
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1-NEXT: vinsertps $32, (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0,1],mem[0],xmm0[3]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je LBB8_8
; AVX1-NEXT: LBB8_7: ## %cond.load9
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm0[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0,1,2],mem[0]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je LBB8_10
; AVX1-NEXT: LBB8_9: ## %cond.load13
-; AVX1-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
@@ -2220,51 +2220,51 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX1-NEXT: je LBB8_12
; AVX1-NEXT: LBB8_11: ## %cond.load17
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je LBB8_14
; AVX1-NEXT: LBB8_13: ## %cond.load21
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je LBB8_16
; AVX1-NEXT: LBB8_15: ## %cond.load25
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $256, %eax ## imm = 0x100
; AVX1-NEXT: je LBB8_18
; AVX1-NEXT: LBB8_17: ## %cond.load29
-; AVX1-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $512, %eax ## imm = 0x200
; AVX1-NEXT: je LBB8_20
; AVX1-NEXT: LBB8_19: ## %cond.load33
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm1[0],mem[0],xmm1[2,3]
+; AVX1-NEXT: vinsertps $16, (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0],mem[0],xmm1[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
; AVX1-NEXT: je LBB8_22
; AVX1-NEXT: LBB8_21: ## %cond.load37
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm1[0,1],mem[0],xmm1[3]
+; AVX1-NEXT: vinsertps $32, (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0,1],mem[0],xmm1[3]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
; AVX1-NEXT: je LBB8_24
; AVX1-NEXT: LBB8_23: ## %cond.load41
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm1[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps $48, (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0,1,2],mem[0]
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX1-NEXT: je LBB8_26
; AVX1-NEXT: LBB8_25: ## %cond.load45
-; AVX1-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
@@ -2273,51 +2273,51 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX1-NEXT: je LBB8_28
; AVX1-NEXT: LBB8_27: ## %cond.load49
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX1-NEXT: je LBB8_30
; AVX1-NEXT: LBB8_29: ## %cond.load53
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX1-NEXT: je LBB8_32
; AVX1-NEXT: LBB8_31: ## %cond.load57
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $65536, %eax ## imm = 0x10000
; AVX1-NEXT: je LBB8_34
; AVX1-NEXT: LBB8_33: ## %cond.load61
-; AVX1-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX1-NEXT: je LBB8_36
; AVX1-NEXT: LBB8_35: ## %cond.load65
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0],mem[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps $16, (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0],mem[0],xmm2[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $262144, %eax ## imm = 0x40000
; AVX1-NEXT: je LBB8_38
; AVX1-NEXT: LBB8_37: ## %cond.load69
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1],mem[0],xmm2[3]
+; AVX1-NEXT: vinsertps $32, (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0,1],mem[0],xmm2[3]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $524288, %eax ## imm = 0x80000
; AVX1-NEXT: je LBB8_40
; AVX1-NEXT: LBB8_39: ## %cond.load73
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps $48, (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0,1,2],mem[0]
; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX1-NEXT: je LBB8_42
; AVX1-NEXT: LBB8_41: ## %cond.load77
-; AVX1-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
@@ -2326,51 +2326,51 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX1-NEXT: je LBB8_44
; AVX1-NEXT: LBB8_43: ## %cond.load81
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX1-NEXT: je LBB8_46
; AVX1-NEXT: LBB8_45: ## %cond.load85
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX1-NEXT: je LBB8_48
; AVX1-NEXT: LBB8_47: ## %cond.load89
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX1-NEXT: je LBB8_50
; AVX1-NEXT: LBB8_49: ## %cond.load93
-; AVX1-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX1-NEXT: je LBB8_52
; AVX1-NEXT: LBB8_51: ## %cond.load97
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm3[0],mem[0],xmm3[2,3]
+; AVX1-NEXT: vinsertps $16, (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0],mem[0],xmm3[2,3]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $67108864, %eax ## imm = 0x4000000
; AVX1-NEXT: je LBB8_54
; AVX1-NEXT: LBB8_53: ## %cond.load101
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm3[0,1],mem[0],xmm3[3]
+; AVX1-NEXT: vinsertps $32, (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0,1],mem[0],xmm3[3]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $134217728, %eax ## imm = 0x8000000
; AVX1-NEXT: je LBB8_56
; AVX1-NEXT: LBB8_55: ## %cond.load105
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm3[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps $48, (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0,1,2],mem[0]
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX1-NEXT: je LBB8_58
; AVX1-NEXT: LBB8_57: ## %cond.load109
-; AVX1-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
@@ -2379,21 +2379,21 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX1-NEXT: je LBB8_60
; AVX1-NEXT: LBB8_59: ## %cond.load113
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; AVX1-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX1-NEXT: je LBB8_62
; AVX1-NEXT: LBB8_61: ## %cond.load117
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; AVX1-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: addq $4, %rdi
; AVX1-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX1-NEXT: je LBB8_64
; AVX1-NEXT: LBB8_63: ## %cond.load121
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0]
+; AVX1-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: retq
;
@@ -2509,31 +2509,31 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX2-NEXT: LBB8_64: ## %else122
; AVX2-NEXT: retq
; AVX2-NEXT: LBB8_1: ## %cond.load
-; AVX2-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testb $2, %al
; AVX2-NEXT: je LBB8_4
; AVX2-NEXT: LBB8_3: ## %cond.load1
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm0[0],mem[0],xmm0[2,3]
+; AVX2-NEXT: vinsertps $16, (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0],mem[0],xmm0[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testb $4, %al
; AVX2-NEXT: je LBB8_6
; AVX2-NEXT: LBB8_5: ## %cond.load5
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm0[0,1],mem[0],xmm0[3]
+; AVX2-NEXT: vinsertps $32, (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0,1],mem[0],xmm0[3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je LBB8_8
; AVX2-NEXT: LBB8_7: ## %cond.load9
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm0[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm4 ## xmm4 = xmm0[0,1,2],mem[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je LBB8_10
; AVX2-NEXT: LBB8_9: ## %cond.load13
-; AVX2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
@@ -2542,51 +2542,51 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX2-NEXT: je LBB8_12
; AVX2-NEXT: LBB8_11: ## %cond.load17
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testb $64, %al
; AVX2-NEXT: je LBB8_14
; AVX2-NEXT: LBB8_13: ## %cond.load21
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je LBB8_16
; AVX2-NEXT: LBB8_15: ## %cond.load25
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $256, %eax ## imm = 0x100
; AVX2-NEXT: je LBB8_18
; AVX2-NEXT: LBB8_17: ## %cond.load29
-; AVX2-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $512, %eax ## imm = 0x200
; AVX2-NEXT: je LBB8_20
; AVX2-NEXT: LBB8_19: ## %cond.load33
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm1[0],mem[0],xmm1[2,3]
+; AVX2-NEXT: vinsertps $16, (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0],mem[0],xmm1[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
; AVX2-NEXT: je LBB8_22
; AVX2-NEXT: LBB8_21: ## %cond.load37
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm1[0,1],mem[0],xmm1[3]
+; AVX2-NEXT: vinsertps $32, (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0,1],mem[0],xmm1[3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
; AVX2-NEXT: je LBB8_24
; AVX2-NEXT: LBB8_23: ## %cond.load41
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm1[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps $48, (%rdi), %xmm1, %xmm4 ## xmm4 = xmm1[0,1,2],mem[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
; AVX2-NEXT: je LBB8_26
; AVX2-NEXT: LBB8_25: ## %cond.load45
-; AVX2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
@@ -2595,51 +2595,51 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX2-NEXT: je LBB8_28
; AVX2-NEXT: LBB8_27: ## %cond.load49
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
; AVX2-NEXT: je LBB8_30
; AVX2-NEXT: LBB8_29: ## %cond.load53
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
; AVX2-NEXT: je LBB8_32
; AVX2-NEXT: LBB8_31: ## %cond.load57
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $65536, %eax ## imm = 0x10000
; AVX2-NEXT: je LBB8_34
; AVX2-NEXT: LBB8_33: ## %cond.load61
-; AVX2-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $131072, %eax ## imm = 0x20000
; AVX2-NEXT: je LBB8_36
; AVX2-NEXT: LBB8_35: ## %cond.load65
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0],mem[0],xmm2[2,3]
+; AVX2-NEXT: vinsertps $16, (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0],mem[0],xmm2[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $262144, %eax ## imm = 0x40000
; AVX2-NEXT: je LBB8_38
; AVX2-NEXT: LBB8_37: ## %cond.load69
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1],mem[0],xmm2[3]
+; AVX2-NEXT: vinsertps $32, (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0,1],mem[0],xmm2[3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $524288, %eax ## imm = 0x80000
; AVX2-NEXT: je LBB8_40
; AVX2-NEXT: LBB8_39: ## %cond.load73
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps $48, (%rdi), %xmm2, %xmm4 ## xmm4 = xmm2[0,1,2],mem[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $1048576, %eax ## imm = 0x100000
; AVX2-NEXT: je LBB8_42
; AVX2-NEXT: LBB8_41: ## %cond.load77
-; AVX2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
@@ -2648,51 +2648,51 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX2-NEXT: je LBB8_44
; AVX2-NEXT: LBB8_43: ## %cond.load81
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $4194304, %eax ## imm = 0x400000
; AVX2-NEXT: je LBB8_46
; AVX2-NEXT: LBB8_45: ## %cond.load85
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $8388608, %eax ## imm = 0x800000
; AVX2-NEXT: je LBB8_48
; AVX2-NEXT: LBB8_47: ## %cond.load89
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $16777216, %eax ## imm = 0x1000000
; AVX2-NEXT: je LBB8_50
; AVX2-NEXT: LBB8_49: ## %cond.load93
-; AVX2-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovd (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $33554432, %eax ## imm = 0x2000000
; AVX2-NEXT: je LBB8_52
; AVX2-NEXT: LBB8_51: ## %cond.load97
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm3[0],mem[0],xmm3[2,3]
+; AVX2-NEXT: vinsertps $16, (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0],mem[0],xmm3[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $67108864, %eax ## imm = 0x4000000
; AVX2-NEXT: je LBB8_54
; AVX2-NEXT: LBB8_53: ## %cond.load101
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm3[0,1],mem[0],xmm3[3]
+; AVX2-NEXT: vinsertps $32, (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0,1],mem[0],xmm3[3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $134217728, %eax ## imm = 0x8000000
; AVX2-NEXT: je LBB8_56
; AVX2-NEXT: LBB8_55: ## %cond.load105
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm3[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps $48, (%rdi), %xmm3, %xmm4 ## xmm4 = xmm3[0,1,2],mem[0]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $268435456, %eax ## imm = 0x10000000
; AVX2-NEXT: je LBB8_58
; AVX2-NEXT: LBB8_57: ## %cond.load109
-; AVX2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss (%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero
; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
@@ -2701,21 +2701,21 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
; AVX2-NEXT: je LBB8_60
; AVX2-NEXT: LBB8_59: ## %cond.load113
; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; AVX2-NEXT: vinsertps $16, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0],mem[0],xmm4[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
; AVX2-NEXT: je LBB8_62
; AVX2-NEXT: LBB8_61: ## %cond.load117
; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; AVX2-NEXT: vinsertps $32, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX2-NEXT: addq $4, %rdi
; AVX2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
; AVX2-NEXT: je LBB8_64
; AVX2-NEXT: LBB8_63: ## %cond.load121
; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],mem[0]
+; AVX2-NEXT: vinsertps $48, (%rdi), %xmm4, %xmm4 ## xmm4 = xmm4[0,1,2],mem[0]
; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX2-NEXT: retq
;
@@ -2754,7 +2754,7 @@ define <32 x float> @expandload_v32f32_v32i32(float* %base, <32 x float> %src0,
define <2 x i64> @expandload_v2i64_const(i64* %base, <2 x i64> %src0) {
; SSE2-LABEL: expandload_v2i64_const:
; SSE2: ## %bb.0:
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
@@ -2819,13 +2819,13 @@ define <4 x i32> @expandload_v4i32_v4i32(i32* %base, <4 x i32> %src0, <4 x i32>
; SSE2-NEXT: LBB10_8: ## %else10
; SSE2-NEXT: retq
; SSE2-NEXT: LBB10_1: ## %cond.load
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: addq $4, %rdi
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je LBB10_4
; SSE2-NEXT: LBB10_3: ## %cond.load1
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT: addq $4, %rdi
@@ -2833,14 +2833,14 @@ define <4 x i32> @expandload_v4i32_v4i32(i32* %base, <4 x i32> %src0, <4 x i32>
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je LBB10_6
; SSE2-NEXT: LBB10_5: ## %cond.load5
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: addq $4, %rdi
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je LBB10_8
; SSE2-NEXT: LBB10_7: ## %cond.load9
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr39666.ll b/llvm/test/CodeGen/X86/pr39666.ll
index d2ef05922481..4bc16c48a097 100644
--- a/llvm/test/CodeGen/X86/pr39666.ll
+++ b/llvm/test/CodeGen/X86/pr39666.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s
define <2 x i64> @test5(i64* %base, <2 x i64> %src0) {
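For reference, the regenerated assertions shown in the hunks above correspond to re-running the update script with the flag now recorded in the UTC_ARGS note of each test. A minimal sketch of the invocation (the working directory and the repo-root-relative path are assumptions; they are not shown in this mail):

  # Sketch only: regenerate the checks with the flag recorded in UTC_ARGS above
  llvm/utils/update_llc_test_checks.py --no_x86_scrub_mem_shuffle \
      llvm/test/CodeGen/X86/pr39666.ll

Passing --no_x86_scrub_mem_shuffle keeps the explicit memory operands (e.g. (%rdi), (%rsi)) in the CHECK lines instead of scrubbing them to {{.*#+}}, which is what exposes the individual element-load offsets in the diff above.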