[llvm] a8ad917 - [X86] Fix handling of maskmovdqu in X32

Harald van Dijk via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 15 14:56:36 PDT 2021


Author: Harald van Dijk
Date: 2021-07-15T22:56:08+01:00
New Revision: a8ad9170543906fc58336ab736a109fb42082fbf

URL: https://github.com/llvm/llvm-project/commit/a8ad9170543906fc58336ab736a109fb42082fbf
DIFF: https://github.com/llvm/llvm-project/commit/a8ad9170543906fc58336ab736a109fb42082fbf.diff

LOG: [X86] Fix handling of maskmovdqu in X32

The maskmovdqu instruction is an odd one: it has a 32-bit and a 64-bit
variant, the former using EDI and the latter RDI, but the use of the
register is implicit. In 64-bit mode, a 0x67 prefix can be used to get
the version using EDI, but there is no way to express this in assembly
as a single instruction; the only way is with an explicit addr32
prefix.
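
As a concrete illustration, the four forms and their encodings look like
this (a sketch in AT&T syntax; the byte sequences match the ones checked
by the updated tests below, assuming %xmm0/%xmm1 operands):

    maskmovdqu         %xmm1, %xmm0   # 66 0f f7 c1     (implicit %rdi)
    addr32 maskmovdqu  %xmm1, %xmm0   # 67 66 0f f7 c1  (implicit %edi)
    vmaskmovdqu        %xmm1, %xmm0   # c5 f9 f7 c1     (implicit %rdi)
    addr32 vmaskmovdqu %xmm1, %xmm0   # 67 c5 f9 f7 c1  (implicit %edi)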

This change adds support for the EDI form of the instruction in 64-bit
mode. When generating assembly text, the explicit addr32 prefix is
printed. When emitting machine code directly, it is kept as a single
instruction and encoded with the 0x67 prefix. When parsing assembly
text, it is re-parsed as an ADDR32 prefix followed by MASKMOVDQU64,
which still produces the correct bytes when converted to machine code.

The same applies to vmaskmovdqu.
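
For example, on an x32 target (x86_64 with 32-bit pointers) a call to the
llvm.x86.sse2.maskmov.dqu intrinsic now lowers to a sequence like the
following (a sketch taken from the updated llc checks in maskmovdqu.ll,
where the pointer argument arrives in %rsi):

    movq   %rsi, %rdi                  # pointer argument; only %edi is used
    addr32 maskmovdqu %xmm1, %xmm0     # encoding: 67 66 0f f7 c1
    retq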

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D103427

Added: 
    llvm/test/MC/X86/maskmovdqu.s
    llvm/test/MC/X86/maskmovdqu64.s

Modified: 
    llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
    llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
    llvm/lib/Target/X86/X86InstrSSE.td
    llvm/lib/Target/X86/X86ScheduleBtVer2.td
    llvm/test/CodeGen/X86/maskmovdqu.ll
    llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
    llvm/utils/TableGen/X86DisassemblerTables.cpp
    llvm/utils/TableGen/X86RecognizableInstr.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
index db8206cf2e3d9..757a3c0c8a712 100644
--- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
+++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h
@@ -116,6 +116,8 @@ enum attributeBits {
   ENUM_ENTRY(IC_VEX_XS,             2,  "requires VEX and the XS prefix")      \
   ENUM_ENTRY(IC_VEX_XD,             2,  "requires VEX and the XD prefix")      \
   ENUM_ENTRY(IC_VEX_OPSIZE,         2,  "requires VEX and the OpSize prefix")  \
+  ENUM_ENTRY(IC_64BIT_VEX_OPSIZE,        4, "requires 64-bit mode and VEX")         \
+  ENUM_ENTRY(IC_64BIT_VEX_OPSIZE_ADSIZE, 5, "requires 64-bit mode, VEX, and AdSize")\
   ENUM_ENTRY(IC_VEX_W,              3,  "requires VEX and the W prefix")       \
   ENUM_ENTRY(IC_VEX_W_XS,           4,  "requires VEX, W, and XS prefix")      \
   ENUM_ENTRY(IC_VEX_W_XD,           4,  "requires VEX, W, and XD prefix")      \

diff  --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 4e6d8e8e1a54e..82581eb3c30a2 100644
--- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -1119,6 +1119,8 @@ static int getInstructionID(struct InternalInstruction *insn,
       switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
       case VEX_PREFIX_66:
         attrMask |= ATTR_OPSIZE;
+        if (insn->hasAdSize)
+          attrMask |= ATTR_ADSIZE;
         break;
       case VEX_PREFIX_F3:
         attrMask |= ATTR_XS;
@@ -1175,6 +1177,8 @@ static int getInstructionID(struct InternalInstruction *insn,
     case 0x66:
       if (insn->mode != MODE_16BIT)
         attrMask |= ATTR_OPSIZE;
+      if (insn->hasAdSize)
+        attrMask |= ATTR_ADSIZE;
       break;
     case 0x67:
       attrMask |= ATTR_ADSIZE;

diff  --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index ae40b712edf5a..41fda603d5a9f 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4011,7 +4011,15 @@ def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
            (ins VR128:$src, VR128:$mask),
            "maskmovdqu\t{$mask, $src|$src, $mask}",
            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
-           VEX, VEX_WIG;
+           VEX, VEX_WIG, AdSize64;
+let Uses = [EDI], Predicates = [HasAVX,In64BitMode] in
+def VMASKMOVDQUX32 : VPDI<0xF7, MRMSrcReg, (outs),
+           (ins VR128:$src, VR128:$mask), "",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
+           VEX, VEX_WIG, AdSize32 {
+  let AsmString = "addr32 vmaskmovdqu\t{$mask, $src|$src, $mask}";
+  let AsmVariantName = "NonParsable";
+}
 
 let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
@@ -4020,7 +4028,15 @@ def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
 let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
 def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
            "maskmovdqu\t{$mask, $src|$src, $mask}",
-           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
+           AdSize64;
+let Uses = [EDI], Predicates = [UseSSE2,In64BitMode] in
+def MASKMOVDQUX32 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+           "addr32 maskmovdqu\t{$mask, $src|$src, $mask}",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
+           AdSize32 {
+  let AsmVariantName = "NonParsable";
+}
 
 } // ExeDomain = SSEPackedInt
 

diff  --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index ef156b573154f..cdd03830bcad3 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -835,8 +835,8 @@ def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JAL
   let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
   let NumMicroOps = 63;
 }
-def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
-                                         VMASKMOVDQU, VMASKMOVDQU64)>;
+def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, MASKMOVDQUX32,
+                                         VMASKMOVDQU, VMASKMOVDQU64, VMASKMOVDQUX32)>;
 
 ///////////////////////////////////////////////////////////////////////////////
 //  SchedWriteVariant definitions.

diff  --git a/llvm/test/CodeGen/X86/maskmovdqu.ll b/llvm/test/CodeGen/X86/maskmovdqu.ll
index a8443e44a16d6..898b0e9c54a25 100644
--- a/llvm/test/CodeGen/X86/maskmovdqu.ll
+++ b/llvm/test/CodeGen/X86/maskmovdqu.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686--    -mattr=+sse2,-avx | FileCheck %s --check-prefix=i686_SSE2
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2,-avx | FileCheck %s --check-prefix=x86_64_SSE2
+; RUN: llc < %s -mtriple=x86_64--gnux32 -mattr=+sse2,-avx | FileCheck %s --check-prefix=x86_x32_SSE2
 ; RUN: llc < %s -mtriple=i686--    -mattr=+avx | FileCheck %s --check-prefix=i686_AVX
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=x86_64_AVX
+; RUN: llc < %s -mtriple=x86_64--gnux32 -mattr=+avx | FileCheck %s --check-prefix=x86_x32_AVX
 ; rdar://6573467
 
 define void @test(<16 x i8> %a, <16 x i8> %b, i32 %dummy, i8* %c) nounwind {
@@ -20,6 +22,13 @@ define void @test(<16 x i8> %a, <16 x i8> %b, i32 %dummy, i8* %c) nounwind {
 ; x86_64_SSE2-NEXT:    maskmovdqu %xmm1, %xmm0
 ; x86_64_SSE2-NEXT:    retq
 ;
+; x86_x32_SSE2-LABEL: test:
+; x86_x32_SSE2:       # %bb.0: # %entry
+; x86_x32_SSE2-NEXT:    movq %rsi, %rdi
+; x86_x32_SSE2-NEXT:    # kill: def $edi killed $edi killed $rdi
+; x86_x32_SSE2-NEXT:    addr32 maskmovdqu %xmm1, %xmm0
+; x86_x32_SSE2-NEXT:    retq
+;
 ; i686_AVX-LABEL: test:
 ; i686_AVX:       # %bb.0: # %entry
 ; i686_AVX-NEXT:    pushl %edi
@@ -33,6 +42,12 @@ define void @test(<16 x i8> %a, <16 x i8> %b, i32 %dummy, i8* %c) nounwind {
 ; x86_64_AVX-NEXT:    movq %rsi, %rdi
 ; x86_64_AVX-NEXT:    vmaskmovdqu %xmm1, %xmm0
 ; x86_64_AVX-NEXT:    retq
+; x86_x32_AVX-LABEL: test:
+; x86_x32_AVX:       # %bb.0: # %entry
+; x86_x32_AVX-NEXT:    movq %rsi, %rdi
+; x86_x32_AVX-NEXT:    # kill: def $edi killed $edi killed $rdi
+; x86_x32_AVX-NEXT:    addr32 vmaskmovdqu %xmm1, %xmm0
+; x86_x32_AVX-NEXT:    retq
 entry:
 	tail call void @llvm.x86.sse2.maskmov.dqu( <16 x i8> %a, <16 x i8> %b, i8* %c )
 	ret void

diff  --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index e3051f669e18a..eaed72299bce2 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -5,6 +5,9 @@
 ; RUN: llc < %s -show-mc-encoding -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
 ; RUN: llc < %s -show-mc-encoding -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
 ; RUN: llc < %s -show-mc-encoding -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
+; RUN: llc < %s -show-mc-encoding -fast-isel -mtriple=x86_64-unknown-unknown-gnux32 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X32,SSE,X32-SSE
+; RUN: llc < %s -show-mc-encoding -fast-isel -mtriple=x86_64-unknown-unknown-gnux32 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X32,AVX,X32-AVX,AVX1,X32-AVX1
+; RUN: llc < %s -show-mc-encoding -fast-isel -mtriple=x86_64-unknown-unknown-gnux32 -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X32,AVX,X32-AVX,AVX512,X32-AVX512
 
 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c
 
@@ -473,6 +476,11 @@ define void @test_mm_clflush(i8* %a0) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    clflush (%rdi) # encoding: [0x0f,0xae,0x3f]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; X32-LABEL: test_mm_clflush:
+; X32:       # %bb.0:
+; X32-NEXT:    clflush (%edi) # encoding: [0x67,0x0f,0xae,0x3f]
+; X32-NEXT:    retq # encoding: [0xc3]
   call void @llvm.x86.sse2.clflush(i8* %a0)
   ret void
 }
@@ -1497,6 +1505,10 @@ define double @test_mm_cvtsd_f64(<2 x double> %a0) nounwind {
 ; X64-LABEL: test_mm_cvtsd_f64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; X32-LABEL: test_mm_cvtsd_f64:
+; X32:       # %bb.0:
+; X32-NEXT:    retq # encoding: [0xc3]
   %res = extractelement <2 x double> %a0, i32 0
   ret double %res
 }
@@ -1574,6 +1586,21 @@ define <4 x float> @test_mm_cvtsd_ss_load(<4 x float> %a0, <2 x double>* %p1) {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vcvtsd2ss (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x5a,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_cvtsd_ss_load:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    cvtsd2ss (%edi), %xmm0 # encoding: [0x67,0xf2,0x0f,0x5a,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_cvtsd_ss_load:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vcvtsd2ss (%edi), %xmm0, %xmm0 # encoding: [0x67,0xc5,0xfb,0x5a,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_cvtsd_ss_load:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vcvtsd2ss (%edi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x5a,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %a1 = load <2 x double>, <2 x double>* %p1
   %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1)
   ret <4 x float> %res
@@ -1629,6 +1656,21 @@ define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x2a,0xc7]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_cvtsi32_sd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    cvtsi2sd %edi, %xmm0 # encoding: [0xf2,0x0f,0x2a,0xc7]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_cvtsi32_sd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x2a,0xc7]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_cvtsi32_sd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x2a,0xc7]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %cvt = sitofp i32 %a1 to double
   %res = insertelement <2 x double> %a0, double %cvt, i32 0
   ret <2 x double> %res
@@ -1667,6 +1709,21 @@ define <2 x i64> @test_mm_cvtsi32_si128(i32 %a0) nounwind {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_cvtsi32_si128:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_cvtsi32_si128:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_cvtsi32_si128:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0 = insertelement <4 x i32> undef, i32 %a0, i32 0
   %res1 = insertelement <4 x i32> %res0, i32 0, i32 1
   %res2 = insertelement <4 x i32> %res1, i32 0, i32 2
@@ -1856,6 +1913,21 @@ define <2 x i64> @test_mm_insert_epi16(<2 x i64> %a0, i16 %a1) nounwind {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x01]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_insert_epi16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    pinsrw $1, %edi, %xmm0 # encoding: [0x66,0x0f,0xc4,0xc7,0x01]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_insert_epi16:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc7,0x01]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_insert_epi16:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x01]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %res = insertelement <8 x i16> %arg0, i16 %a1,i32 1
   %bc = bitcast <8 x i16> %res to <2 x i64>
@@ -1905,6 +1977,21 @@ define <2 x double> @test_mm_load_pd(double* %a0) nounwind {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_load_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movaps (%edi), %xmm0 # encoding: [0x67,0x0f,0x28,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_load_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovaps (%edi), %xmm0 # encoding: [0x67,0xc5,0xf8,0x28,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_load_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovaps (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x28,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %arg0 = bitcast double* %a0 to <2 x double>*
   %res = load <2 x double>, <2 x double>* %arg0, align 16
   ret <2 x double> %res
@@ -1949,6 +2036,24 @@ define <2 x double> @test_mm_load_sd(double* %a0) nounwind {
 ; X64-AVX512-NEXT:    vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]
 ; X64-AVX512-NEXT:    # xmm0 = mem[0],zero
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_load_sd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movsd (%edi), %xmm0 # encoding: [0x67,0xf2,0x0f,0x10,0x07]
+; X32-SSE-NEXT:    # xmm0 = mem[0],zero
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_load_sd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovsd (%edi), %xmm0 # encoding: [0x67,0xc5,0xfb,0x10,0x07]
+; X32-AVX1-NEXT:    # xmm0 = mem[0],zero
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_load_sd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovsd (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x10,0x07]
+; X32-AVX512-NEXT:    # xmm0 = mem[0],zero
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %ld = load double, double* %a0, align 1
   %res0 = insertelement <2 x double> undef, double %ld, i32 0
   %res1 = insertelement <2 x double> %res0, double 0.0, i32 1
@@ -1988,6 +2093,21 @@ define <2 x i64> @test_mm_load_si128(<2 x i64>* %a0) nounwind {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_load_si128:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movaps (%edi), %xmm0 # encoding: [0x67,0x0f,0x28,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_load_si128:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovaps (%edi), %xmm0 # encoding: [0x67,0xc5,0xf8,0x28,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_load_si128:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovaps (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x28,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res = load <2 x i64>, <2 x i64>* %a0, align 16
   ret <2 x i64> %res
 }
@@ -2035,6 +2155,26 @@ define <2 x double> @test_mm_load1_pd(double* %a0) nounwind {
 ; X64-AVX512-NEXT:    vmovddup (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x07]
 ; X64-AVX512-NEXT:    # xmm0 = mem[0,0]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_load1_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movsd (%edi), %xmm0 # encoding: [0x67,0xf2,0x0f,0x10,0x07]
+; X32-SSE-NEXT:    # xmm0 = mem[0],zero
+; X32-SSE-NEXT:    movlhps %xmm0, %xmm0 # encoding: [0x0f,0x16,0xc0]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0,0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_load1_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovddup (%edi), %xmm0 # encoding: [0x67,0xc5,0xfb,0x12,0x07]
+; X32-AVX1-NEXT:    # xmm0 = mem[0,0]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_load1_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovddup (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x12,0x07]
+; X32-AVX512-NEXT:    # xmm0 = mem[0,0]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %ld = load double, double* %a0, align 8
   %res0 = insertelement <2 x double> undef, double %ld, i32 0
   %res1 = insertelement <2 x double> %res0, double %ld, i32 1
@@ -2080,6 +2220,24 @@ define <2 x double> @test_mm_loadh_pd(<2 x double> %a0, double* %a1) nounwind {
 ; X64-AVX512-NEXT:    vmovhps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0x07]
 ; X64-AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_loadh_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movhps (%edi), %xmm0 # encoding: [0x67,0x0f,0x16,0x07]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_loadh_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovhps (%edi), %xmm0, %xmm0 # encoding: [0x67,0xc5,0xf8,0x16,0x07]
+; X32-AVX1-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_loadh_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovhps (%edi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x16,0x07]
+; X32-AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %ld = load double, double* %a1, align 8
   %res = insertelement <2 x double> %a0, double %ld, i32 1
   ret <2 x double> %res
@@ -2124,6 +2282,24 @@ define <2 x i64> @test_mm_loadl_epi64(<2 x i64> %a0, <2 x i64>* %a1) nounwind {
 ; X64-AVX512-NEXT:    vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]
 ; X64-AVX512-NEXT:    # xmm0 = mem[0],zero
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_loadl_epi64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movsd (%edi), %xmm0 # encoding: [0x67,0xf2,0x0f,0x10,0x07]
+; X32-SSE-NEXT:    # xmm0 = mem[0],zero
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_loadl_epi64:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovsd (%edi), %xmm0 # encoding: [0x67,0xc5,0xfb,0x10,0x07]
+; X32-AVX1-NEXT:    # xmm0 = mem[0],zero
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_loadl_epi64:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovsd (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x10,0x07]
+; X32-AVX512-NEXT:    # xmm0 = mem[0],zero
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %bc = bitcast <2 x i64>* %a1 to i64*
   %ld = load i64, i64* %bc, align 1
   %res0 = insertelement <2 x i64> undef, i64 %ld, i32 0
@@ -2170,6 +2346,24 @@ define <2 x double> @test_mm_loadl_pd(<2 x double> %a0, double* %a1) nounwind {
 ; X64-AVX512-NEXT:    vmovlps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x12,0x07]
 ; X64-AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_loadl_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movlps (%edi), %xmm0 # encoding: [0x67,0x0f,0x12,0x07]
+; X32-SSE-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_loadl_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovlps (%edi), %xmm0, %xmm0 # encoding: [0x67,0xc5,0xf8,0x12,0x07]
+; X32-AVX1-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_loadl_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovlps (%edi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x12,0x07]
+; X32-AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %ld = load double, double* %a1, align 8
   %res = insertelement <2 x double> %a0, double %ld, i32 0
   ret <2 x double> %res
@@ -2216,6 +2410,25 @@ define <2 x double> @test_mm_loadr_pd(double* %a0) nounwind {
 ; X64-AVX512-NEXT:    vpermilpd $1, (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0x07,0x01]
 ; X64-AVX512-NEXT:    # xmm0 = mem[1,0]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_loadr_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movaps (%edi), %xmm0 # encoding: [0x67,0x0f,0x28,0x07]
+; X32-SSE-NEXT:    shufps $78, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x4e]
+; X32-SSE-NEXT:    # xmm0 = xmm0[2,3,0,1]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_loadr_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vpermilpd $1, (%edi), %xmm0 # encoding: [0x67,0xc4,0xe3,0x79,0x05,0x07,0x01]
+; X32-AVX1-NEXT:    # xmm0 = mem[1,0]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_loadr_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vpermilpd $1, (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc4,0xe3,0x79,0x05,0x07,0x01]
+; X32-AVX512-NEXT:    # xmm0 = mem[1,0]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %arg0 = bitcast double* %a0 to <2 x double>*
   %ld = load <2 x double>, <2 x double>* %arg0, align 16
   %res = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 0>
@@ -2255,6 +2468,21 @@ define <2 x double> @test_mm_loadu_pd(double* %a0) nounwind {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_loadu_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movups (%edi), %xmm0 # encoding: [0x67,0x0f,0x10,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_loadu_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovups (%edi), %xmm0 # encoding: [0x67,0xc5,0xf8,0x10,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_loadu_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovups (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x10,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %arg0 = bitcast double* %a0 to <2 x double>*
   %res = load <2 x double>, <2 x double>* %arg0, align 1
   ret <2 x double> %res
@@ -2293,6 +2521,21 @@ define <2 x i64> @test_mm_loadu_si128(<2 x i64>* %a0) nounwind {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_loadu_si128:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movups (%edi), %xmm0 # encoding: [0x67,0x0f,0x10,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_loadu_si128:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovups (%edi), %xmm0 # encoding: [0x67,0xc5,0xf8,0x10,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_loadu_si128:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovups (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x10,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res = load <2 x i64>, <2 x i64>* %a0, align 1
   ret <2 x i64> %res
 }
@@ -2336,6 +2579,24 @@ define <2 x i64> @test_mm_loadu_si64(i8* nocapture readonly %A) {
 ; X64-AVX512-NEXT:    vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]
 ; X64-AVX512-NEXT:    # xmm0 = mem[0],zero
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_loadu_si64:
+; X32-SSE:       # %bb.0: # %entry
+; X32-SSE-NEXT:    movsd (%edi), %xmm0 # encoding: [0x67,0xf2,0x0f,0x10,0x07]
+; X32-SSE-NEXT:    # xmm0 = mem[0],zero
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_loadu_si64:
+; X32-AVX1:       # %bb.0: # %entry
+; X32-AVX1-NEXT:    vmovsd (%edi), %xmm0 # encoding: [0x67,0xc5,0xfb,0x10,0x07]
+; X32-AVX1-NEXT:    # xmm0 = mem[0],zero
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_loadu_si64:
+; X32-AVX512:       # %bb.0: # %entry
+; X32-AVX512-NEXT:    vmovsd (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x10,0x07]
+; X32-AVX512-NEXT:    # xmm0 = mem[0],zero
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
 entry:
   %__v.i = bitcast i8* %A to i64*
   %0 = load i64, i64* %__v.i, align 1
@@ -2382,6 +2643,24 @@ define <2 x i64> @test_mm_loadu_si32(i8* nocapture readonly %A) {
 ; X64-AVX512-NEXT:    vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
 ; X64-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_loadu_si32:
+; X32-SSE:       # %bb.0: # %entry
+; X32-SSE-NEXT:    movss (%edi), %xmm0 # encoding: [0x67,0xf3,0x0f,0x10,0x07]
+; X32-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_loadu_si32:
+; X32-AVX1:       # %bb.0: # %entry
+; X32-AVX1-NEXT:    vmovss (%edi), %xmm0 # encoding: [0x67,0xc5,0xfa,0x10,0x07]
+; X32-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_loadu_si32:
+; X32-AVX512:       # %bb.0: # %entry
+; X32-AVX512-NEXT:    vmovss (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfa,0x10,0x07]
+; X32-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
 entry:
   %__v.i = bitcast i8* %A to i32*
   %0 = load i32, i32* %__v.i, align 1
@@ -2429,6 +2708,24 @@ define <2 x i64> @test_mm_loadu_si16(i8* nocapture readonly %A) {
 ; X64-AVX512-NEXT:    movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
 ; X64-AVX512-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_loadu_si16:
+; X32-SSE:       # %bb.0: # %entry
+; X32-SSE-NEXT:    movzwl (%edi), %eax # encoding: [0x67,0x0f,0xb7,0x07]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_loadu_si16:
+; X32-AVX1:       # %bb.0: # %entry
+; X32-AVX1-NEXT:    movzwl (%edi), %eax # encoding: [0x67,0x0f,0xb7,0x07]
+; X32-AVX1-NEXT:    vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_loadu_si16:
+; X32-AVX512:       # %bb.0: # %entry
+; X32-AVX512-NEXT:    movzwl (%edi), %eax # encoding: [0x67,0x0f,0xb7,0x07]
+; X32-AVX512-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
 entry:
   %__v.i = bitcast i8* %A to i16*
   %0 = load i16, i16* %__v.i, align 1
@@ -2486,6 +2783,18 @@ define void @test_mm_maskmoveu_si128(<2 x i64> %a0, <2 x i64> %a1, i8* %a2) noun
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    vmaskmovdqu %xmm1, %xmm0 # encoding: [0xc5,0xf9,0xf7,0xc1]
 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_maskmoveu_si128:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    # kill: def $edi killed $edi killed $rdi
+; X32-SSE-NEXT:    addr32 maskmovdqu %xmm1, %xmm0 # encoding: [0x67,0x66,0x0f,0xf7,0xc1]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX-LABEL: test_mm_maskmoveu_si128:
+; X32-AVX:       # %bb.0:
+; X32-AVX-NEXT:    # kill: def $edi killed $edi killed $rdi
+; X32-AVX-NEXT:    addr32 vmaskmovdqu %xmm1, %xmm0 # encoding: [0x67,0xc5,0xf9,0xf7,0xc1]
+; X32-AVX-NEXT:    retq # encoding: [0xc3]
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
   call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %arg0, <16 x i8> %arg1, i8* %a2)
@@ -3300,6 +3609,144 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a
 ; X64-AVX512-NEXT:    movzbl %dil, %eax # encoding: [0x40,0x0f,0xb6,0xc7]
 ; X64-AVX512-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0f]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_set_epi8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movzbl %dil, %eax # encoding: [0x40,0x0f,0xb6,0xc7]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl %sil, %eax # encoding: [0x40,0x0f,0xb6,0xc6]
+; X32-SSE-NEXT:    movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE-NEXT:    movzbl %dl, %eax # encoding: [0x0f,0xb6,0xc2]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
+; X32-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0]
+; X32-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE-NEXT:    punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1]
+; X32-SSE-NEXT:    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-SSE-NEXT:    movzbl %r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1]
+; X32-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8]
+; X32-SSE-NEXT:    # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x08]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x10]
+; X32-SSE-NEXT:    movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE-NEXT:    punpcklwd %xmm3, %xmm1 # encoding: [0x66,0x0f,0x61,0xcb]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X32-SSE-NEXT:    punpckldq %xmm2, %xmm1 # encoding: [0x66,0x0f,0x62,0xca]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x18]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x20]
+; X32-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0]
+; X32-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x28]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x30]
+; X32-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8]
+; X32-SSE-NEXT:    # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-SSE-NEXT:    punpcklwd %xmm2, %xmm3 # encoding: [0x66,0x0f,0x61,0xda]
+; X32-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x38]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x40]
+; X32-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0]
+; X32-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x48]
+; X32-SSE-NEXT:    movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x50]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    punpcklbw %xmm4, %xmm0 # encoding: [0x66,0x0f,0x60,0xc4]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X32-SSE-NEXT:    punpcklwd %xmm2, %xmm0 # encoding: [0x66,0x0f,0x61,0xc2]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X32-SSE-NEXT:    punpckldq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x62,0xc3]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE-NEXT:    punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_set_epi8:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x48]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x50]
+; X32-AVX1-NEXT:    vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0]
+; X32-AVX1-NEXT:    vpinsrb $1, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x40]
+; X32-AVX1-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x38]
+; X32-AVX1-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x30]
+; X32-AVX1-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x28]
+; X32-AVX1-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x20]
+; X32-AVX1-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x18]
+; X32-AVX1-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x10]
+; X32-AVX1-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x08]
+; X32-AVX1-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x09]
+; X32-AVX1-NEXT:    movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1]
+; X32-AVX1-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; X32-AVX1-NEXT:    movzbl %r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0]
+; X32-AVX1-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0b]
+; X32-AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
+; X32-AVX1-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; X32-AVX1-NEXT:    movzbl %dl, %eax # encoding: [0x0f,0xb6,0xc2]
+; X32-AVX1-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0d]
+; X32-AVX1-NEXT:    movzbl %sil, %eax # encoding: [0x40,0x0f,0xb6,0xc6]
+; X32-AVX1-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; X32-AVX1-NEXT:    movzbl %dil, %eax # encoding: [0x40,0x0f,0xb6,0xc7]
+; X32-AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0f]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_set_epi8:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb6,0x54,0x24,0x48]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x50]
+; X32-AVX512-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; X32-AVX512-NEXT:    vpinsrb $1, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc3,0x79,0x20,0xc2,0x01]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x40]
+; X32-AVX512-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x38]
+; X32-AVX512-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x30]
+; X32-AVX512-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x28]
+; X32-AVX512-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x20]
+; X32-AVX512-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x18]
+; X32-AVX512-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x10]
+; X32-AVX512-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x08]
+; X32-AVX512-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x09]
+; X32-AVX512-NEXT:    movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1]
+; X32-AVX512-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; X32-AVX512-NEXT:    movzbl %r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0]
+; X32-AVX512-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0b]
+; X32-AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
+; X32-AVX512-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; X32-AVX512-NEXT:    movzbl %dl, %eax # encoding: [0x0f,0xb6,0xc2]
+; X32-AVX512-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0d]
+; X32-AVX512-NEXT:    movzbl %sil, %eax # encoding: [0x40,0x0f,0xb6,0xc6]
+; X32-AVX512-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; X32-AVX512-NEXT:    movzbl %dil, %eax # encoding: [0x40,0x0f,0xb6,0xc7]
+; X32-AVX512-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0f]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <16 x i8> undef,  i8 %a15, i32 0
   %res1  = insertelement <16 x i8> %res0,  i8 %a14, i32 1
   %res2  = insertelement <16 x i8> %res1,  i8 %a13, i32 2
@@ -3450,6 +3897,62 @@ define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4,
 ; X64-AVX512-NEXT:    vpinsrw $6, %esi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc6,0x06]
 ; X64-AVX512-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_set_epi16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x10]
+; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x08]
+; X32-SSE-NEXT:    movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7]
+; X32-SSE-NEXT:    movd %esi, %xmm1 # encoding: [0x66,0x0f,0x6e,0xce]
+; X32-SSE-NEXT:    punpcklwd %xmm0, %xmm1 # encoding: [0x66,0x0f,0x61,0xc8]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE-NEXT:    movd %edx, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc2]
+; X32-SSE-NEXT:    movd %ecx, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd1]
+; X32-SSE-NEXT:    punpcklwd %xmm0, %xmm2 # encoding: [0x66,0x0f,0x61,0xd0]
+; X32-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE-NEXT:    punpckldq %xmm1, %xmm2 # encoding: [0x66,0x0f,0x62,0xd1]
+; X32-SSE-NEXT:    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-SSE-NEXT:    movd %r8d, %xmm0 # encoding: [0x66,0x41,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movd %r9d, %xmm1 # encoding: [0x66,0x41,0x0f,0x6e,0xc9]
+; X32-SSE-NEXT:    punpcklwd %xmm0, %xmm1 # encoding: [0x66,0x0f,0x61,0xc8]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
+; X32-SSE-NEXT:    movd %r10d, %xmm0 # encoding: [0x66,0x41,0x0f,0x6e,0xc2]
+; X32-SSE-NEXT:    punpcklwd %xmm3, %xmm0 # encoding: [0x66,0x0f,0x61,0xc3]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; X32-SSE-NEXT:    punpckldq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x62,0xc1]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    punpcklqdq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc2]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_set_epi16:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x10]
+; X32-AVX1-NEXT:    movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x08]
+; X32-AVX1-NEXT:    vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0]
+; X32-AVX1-NEXT:    vpinsrw $1, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x01]
+; X32-AVX1-NEXT:    vpinsrw $2, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x02]
+; X32-AVX1-NEXT:    vpinsrw $3, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x03]
+; X32-AVX1-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
+; X32-AVX1-NEXT:    vpinsrw $5, %edx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc2,0x05]
+; X32-AVX1-NEXT:    vpinsrw $6, %esi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc6,0x06]
+; X32-AVX1-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_set_epi16:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x10]
+; X32-AVX512-NEXT:    movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x08]
+; X32-AVX512-NEXT:    vmovd %eax, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; X32-AVX512-NEXT:    vpinsrw $1, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x01]
+; X32-AVX512-NEXT:    vpinsrw $2, %r9d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x02]
+; X32-AVX512-NEXT:    vpinsrw $3, %r8d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x03]
+; X32-AVX512-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
+; X32-AVX512-NEXT:    vpinsrw $5, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc2,0x05]
+; X32-AVX512-NEXT:    vpinsrw $6, %esi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc6,0x06]
+; X32-AVX512-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <8 x i16> undef, i16 %a7, i32 0
   %res1  = insertelement <8 x i16> %res0, i16 %a6, i32 1
   %res2  = insertelement <8 x i16> %res1, i16 %a5, i32 2
@@ -3528,6 +4031,36 @@ define <2 x i64> @test_mm_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind
 ; X64-AVX512-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc6,0x02]
 ; X64-AVX512-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_set_epi32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7]
+; X32-SSE-NEXT:    movd %esi, %xmm1 # encoding: [0x66,0x0f,0x6e,0xce]
+; X32-SSE-NEXT:    punpckldq %xmm0, %xmm1 # encoding: [0x66,0x0f,0x62,0xc8]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE-NEXT:    movd %edx, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd2]
+; X32-SSE-NEXT:    movd %ecx, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc1]
+; X32-SSE-NEXT:    punpckldq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x62,0xc2]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE-NEXT:    punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_set_epi32:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovd %ecx, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc1]
+; X32-AVX1-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x01]
+; X32-AVX1-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc6,0x02]
+; X32-AVX1-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_set_epi32:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovd %ecx, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
+; X32-AVX512-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x01]
+; X32-AVX512-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc6,0x02]
+; X32-AVX512-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <4 x i32> undef, i32 %a3, i32 0
   %res1  = insertelement <4 x i32> %res0, i32 %a2, i32 1
   %res2  = insertelement <4 x i32> %res1, i32 %a1, i32 2
@@ -3598,6 +4131,30 @@ define <2 x i64> @test_mm_set_epi64x(i64 %a0, i64 %a1) nounwind {
 ; X64-AVX512-NEXT:    vpunpcklqdq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xc0]
 ; X64-AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[0]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_set_epi64x:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movq %rdi, %xmm1 # encoding: [0x66,0x48,0x0f,0x6e,0xcf]
+; X32-SSE-NEXT:    movq %rsi, %xmm0 # encoding: [0x66,0x48,0x0f,0x6e,0xc6]
+; X32-SSE-NEXT:    punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_set_epi64x:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovq %rdi, %xmm0 # encoding: [0xc4,0xe1,0xf9,0x6e,0xc7]
+; X32-AVX1-NEXT:    vmovq %rsi, %xmm1 # encoding: [0xc4,0xe1,0xf9,0x6e,0xce]
+; X32-AVX1-NEXT:    vpunpcklqdq %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0x6c,0xc0]
+; X32-AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[0]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_set_epi64x:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovq %rdi, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x6e,0xc7]
+; X32-AVX512-NEXT:    vmovq %rsi, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x6e,0xce]
+; X32-AVX512-NEXT:    vpunpcklqdq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xc0]
+; X32-AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[0]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <2 x i64> undef, i64 %a1, i32 0
   %res1  = insertelement <2 x i64> %res0, i64 %a0, i32 1
   ret <2 x i64> %res1
@@ -3652,6 +4209,25 @@ define <2 x double> @test_mm_set_pd(double %a0, double %a1) nounwind {
 ; X64-AVX512-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0]
 ; X64-AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[0]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_set_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movlhps %xmm0, %xmm1 # encoding: [0x0f,0x16,0xc8]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0]
+; X32-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_set_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0x16,0xc0]
+; X32-AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[0]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_set_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovlhps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0]
+; X32-AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[0]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <2 x double> undef, double %a1, i32 0
   %res1  = insertelement <2 x double> %res0, double %a0, i32 1
   ret <2 x double> %res1
@@ -3699,6 +4275,24 @@ define <2 x double> @test_mm_set_pd1(double %a0) nounwind {
 ; X64-AVX512-NEXT:    vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0]
 ; X64-AVX512-NEXT:    # xmm0 = xmm0[0,0]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_set_pd1:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movlhps %xmm0, %xmm0 # encoding: [0x0f,0x16,0xc0]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0,0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_set_pd1:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovddup %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x12,0xc0]
+; X32-AVX1-NEXT:    # xmm0 = xmm0[0,0]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_set_pd1:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0]
+; X32-AVX512-NEXT:    # xmm0 = xmm0[0,0]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <2 x double> undef, double %a0, i32 0
   %res1  = insertelement <2 x double> %res0, double %a0, i32 1
   ret <2 x double> %res1
@@ -3746,6 +4340,24 @@ define <2 x double> @test_mm_set_sd(double %a0) nounwind {
 ; X64-AVX512-NEXT:    vmovq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0]
 ; X64-AVX512-NEXT:    # xmm0 = xmm0[0],zero
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_set_sd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movq %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x7e,0xc0]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],zero
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_set_sd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovq %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x7e,0xc0]
+; X32-AVX1-NEXT:    # xmm0 = xmm0[0],zero
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_set_sd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0]
+; X32-AVX512-NEXT:    # xmm0 = xmm0[0],zero
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <2 x double> undef, double %a0, i32 0
   %res1  = insertelement <2 x double> %res0, double 0.0, i32 1
   ret <2 x double> %res1
@@ -3802,6 +4414,31 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vpbroadcastb %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xc7]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_set1_epi8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movzbl %dil, %eax # encoding: [0x40,0x0f,0xb6,0xc7]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm0 # encoding: [0x66,0x0f,0x60,0xc0]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X32-SSE-NEXT:    pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X32-SSE-NEXT:    pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_set1_epi8:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    movzbl %dil, %eax # encoding: [0x40,0x0f,0xb6,0xc7]
+; X32-AVX1-NEXT:    vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0]
+; X32-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0xef,0xc9]
+; X32-AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x00,0xc1]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_set1_epi8:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vpbroadcastb %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xc7]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <16 x i8> undef,  i8 %a0, i32 0
   %res1  = insertelement <16 x i8> %res0,  i8 %a0, i32 1
   %res2  = insertelement <16 x i8> %res1,  i8 %a0, i32 2
@@ -3871,6 +4508,29 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vpbroadcastw %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xc7]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_set1_epi16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7]
+; X32-SSE-NEXT:    pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X32-SSE-NEXT:    pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_set1_epi16:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7]
+; X32-AVX1-NEXT:    vpshuflw $0, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x70,0xc0,0x00]
+; X32-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; X32-AVX1-NEXT:    vpshufd $0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x00]
+; X32-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_set1_epi16:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vpbroadcastw %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xc7]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <8 x i16> undef, i16 %a0, i32 0
   %res1  = insertelement <8 x i16> %res0, i16 %a0, i32 1
   %res2  = insertelement <8 x i16> %res1, i16 %a0, i32 2
@@ -3924,6 +4584,25 @@ define <2 x i64> @test_mm_set1_epi32(i32 %a0) nounwind {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vpbroadcastd %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xc7]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_set1_epi32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7]
+; X32-SSE-NEXT:    pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_set1_epi32:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7]
+; X32-AVX1-NEXT:    vpshufd $0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x00]
+; X32-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_set1_epi32:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vpbroadcastd %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xc7]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <4 x i32> undef, i32 %a0, i32 0
   %res1  = insertelement <4 x i32> %res0, i32 %a0, i32 1
   %res2  = insertelement <4 x i32> %res1, i32 %a0, i32 2
@@ -3982,6 +4661,25 @@ define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vpbroadcastq %rdi, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xc7]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_set1_epi64x:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movq %rdi, %xmm0 # encoding: [0x66,0x48,0x0f,0x6e,0xc7]
+; X32-SSE-NEXT:    pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0,1,0,1]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_set1_epi64x:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovq %rdi, %xmm0 # encoding: [0xc4,0xe1,0xf9,0x6e,0xc7]
+; X32-AVX1-NEXT:    vpshufd $68, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x44]
+; X32-AVX1-NEXT:    # xmm0 = xmm0[0,1,0,1]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_set1_epi64x:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vpbroadcastq %rdi, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xc7]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <2 x i64> undef, i64 %a0, i32 0
   %res1  = insertelement <2 x i64> %res0, i64 %a0, i32 1
   ret <2 x i64> %res1
@@ -4029,6 +4727,24 @@ define <2 x double> @test_mm_set1_pd(double %a0) nounwind {
 ; X64-AVX512-NEXT:    vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0]
 ; X64-AVX512-NEXT:    # xmm0 = xmm0[0,0]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_set1_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movlhps %xmm0, %xmm0 # encoding: [0x0f,0x16,0xc0]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0,0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_set1_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovddup %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x12,0xc0]
+; X32-AVX1-NEXT:    # xmm0 = xmm0[0,0]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_set1_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0]
+; X32-AVX512-NEXT:    # xmm0 = xmm0[0,0]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <2 x double> undef, double %a0, i32 0
   %res1  = insertelement <2 x double> %res0, double %a0, i32 1
   ret <2 x double> %res1
@@ -4310,6 +5026,144 @@ define <2 x i64> @test_mm_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %
 ; X64-AVX512-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x50]
 ; X64-AVX512-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0f]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_setr_epi8:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x50]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x48]
+; X32-SSE-NEXT:    movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x40]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x38]
+; X32-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0]
+; X32-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE-NEXT:    punpcklwd %xmm1, %xmm2 # encoding: [0x66,0x0f,0x61,0xd1]
+; X32-SSE-NEXT:    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x30]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x28]
+; X32-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8]
+; X32-SSE-NEXT:    # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x20]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x18]
+; X32-SSE-NEXT:    movd %eax, %xmm1 # encoding: [0x66,0x0f,0x6e,0xc8]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm1 # encoding: [0x66,0x0f,0x60,0xc8]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-SSE-NEXT:    punpcklwd %xmm3, %xmm1 # encoding: [0x66,0x0f,0x61,0xcb]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; X32-SSE-NEXT:    punpckldq %xmm2, %xmm1 # encoding: [0x66,0x0f,0x62,0xca]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x10]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x08]
+; X32-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0]
+; X32-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE-NEXT:    movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl %r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0]
+; X32-SSE-NEXT:    movd %eax, %xmm3 # encoding: [0x66,0x0f,0x6e,0xd8]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm3 # encoding: [0x66,0x0f,0x60,0xd8]
+; X32-SSE-NEXT:    # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; X32-SSE-NEXT:    punpcklwd %xmm2, %xmm3 # encoding: [0x66,0x0f,0x61,0xda]
+; X32-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X32-SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movzbl %dl, %eax # encoding: [0x0f,0xb6,0xc2]
+; X32-SSE-NEXT:    movd %eax, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd0]
+; X32-SSE-NEXT:    punpcklbw %xmm0, %xmm2 # encoding: [0x66,0x0f,0x60,0xd0]
+; X32-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X32-SSE-NEXT:    movzbl %sil, %eax # encoding: [0x40,0x0f,0xb6,0xc6]
+; X32-SSE-NEXT:    movd %eax, %xmm4 # encoding: [0x66,0x0f,0x6e,0xe0]
+; X32-SSE-NEXT:    movzbl %dil, %eax # encoding: [0x40,0x0f,0xb6,0xc7]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    punpcklbw %xmm4, %xmm0 # encoding: [0x66,0x0f,0x60,0xc4]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; X32-SSE-NEXT:    punpcklwd %xmm2, %xmm0 # encoding: [0x66,0x0f,0x61,0xc2]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X32-SSE-NEXT:    punpckldq %xmm3, %xmm0 # encoding: [0x66,0x0f,0x62,0xc3]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE-NEXT:    punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_setr_epi8:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    movzbl %sil, %eax # encoding: [0x40,0x0f,0xb6,0xc6]
+; X32-AVX1-NEXT:    movzbl %dil, %esi # encoding: [0x40,0x0f,0xb6,0xf7]
+; X32-AVX1-NEXT:    vmovd %esi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc6]
+; X32-AVX1-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
+; X32-AVX1-NEXT:    movzbl %dl, %eax # encoding: [0x0f,0xb6,0xc2]
+; X32-AVX1-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X32-AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
+; X32-AVX1-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03]
+; X32-AVX1-NEXT:    movzbl %r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0]
+; X32-AVX1-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X32-AVX1-NEXT:    movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1]
+; X32-AVX1-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x08]
+; X32-AVX1-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x10]
+; X32-AVX1-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x18]
+; X32-AVX1-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x20]
+; X32-AVX1-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x09]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x28]
+; X32-AVX1-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x30]
+; X32-AVX1-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0b]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x38]
+; X32-AVX1-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x40]
+; X32-AVX1-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0d]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x48]
+; X32-AVX1-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; X32-AVX1-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x50]
+; X32-AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0f]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_setr_epi8:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    movzbl %sil, %eax # encoding: [0x40,0x0f,0xb6,0xc6]
+; X32-AVX512-NEXT:    movzbl %dil, %esi # encoding: [0x40,0x0f,0xb6,0xf7]
+; X32-AVX512-NEXT:    vmovd %esi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
+; X32-AVX512-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x01]
+; X32-AVX512-NEXT:    movzbl %dl, %eax # encoding: [0x0f,0xb6,0xc2]
+; X32-AVX512-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
+; X32-AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
+; X32-AVX512-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x03]
+; X32-AVX512-NEXT:    movzbl %r8b, %eax # encoding: [0x41,0x0f,0xb6,0xc0]
+; X32-AVX512-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; X32-AVX512-NEXT:    movzbl %r9b, %eax # encoding: [0x41,0x0f,0xb6,0xc1]
+; X32-AVX512-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x05]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x08]
+; X32-AVX512-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x10]
+; X32-AVX512-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x07]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x18]
+; X32-AVX512-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x20]
+; X32-AVX512-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x09]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x28]
+; X32-AVX512-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x30]
+; X32-AVX512-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0b]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x38]
+; X32-AVX512-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x40]
+; X32-AVX512-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0d]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x48]
+; X32-AVX512-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
+; X32-AVX512-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb6,0x44,0x24,0x50]
+; X32-AVX512-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0f]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <16 x i8> undef,  i8 %a0 , i32 0
   %res1  = insertelement <16 x i8> %res0,  i8 %a1 , i32 1
   %res2  = insertelement <16 x i8> %res1,  i8 %a2 , i32 2
@@ -4460,6 +5314,62 @@ define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4
 ; X64-AVX512-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
 ; X64-AVX512-NEXT:    vpinsrw $7, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_setr_epi16:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x10]
+; X32-SSE-NEXT:    movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x08]
+; X32-SSE-NEXT:    movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0]
+; X32-SSE-NEXT:    movd %r10d, %xmm1 # encoding: [0x66,0x41,0x0f,0x6e,0xca]
+; X32-SSE-NEXT:    punpcklwd %xmm0, %xmm1 # encoding: [0x66,0x0f,0x61,0xc8]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE-NEXT:    movd %r9d, %xmm0 # encoding: [0x66,0x41,0x0f,0x6e,0xc1]
+; X32-SSE-NEXT:    movd %r8d, %xmm2 # encoding: [0x66,0x41,0x0f,0x6e,0xd0]
+; X32-SSE-NEXT:    punpcklwd %xmm0, %xmm2 # encoding: [0x66,0x0f,0x61,0xd0]
+; X32-SSE-NEXT:    # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; X32-SSE-NEXT:    punpckldq %xmm1, %xmm2 # encoding: [0x66,0x0f,0x62,0xd1]
+; X32-SSE-NEXT:    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-SSE-NEXT:    movd %ecx, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc1]
+; X32-SSE-NEXT:    movd %edx, %xmm1 # encoding: [0x66,0x0f,0x6e,0xca]
+; X32-SSE-NEXT:    punpcklwd %xmm0, %xmm1 # encoding: [0x66,0x0f,0x61,0xc8]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X32-SSE-NEXT:    movd %esi, %xmm3 # encoding: [0x66,0x0f,0x6e,0xde]
+; X32-SSE-NEXT:    movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7]
+; X32-SSE-NEXT:    punpcklwd %xmm3, %xmm0 # encoding: [0x66,0x0f,0x61,0xc3]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; X32-SSE-NEXT:    punpckldq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x62,0xc1]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-SSE-NEXT:    punpcklqdq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc2]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_setr_epi16:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x10]
+; X32-AVX1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x08]
+; X32-AVX1-NEXT:    vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7]
+; X32-AVX1-NEXT:    vpinsrw $1, %esi, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc6,0x01]
+; X32-AVX1-NEXT:    vpinsrw $2, %edx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc2,0x02]
+; X32-AVX1-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc1,0x03]
+; X32-AVX1-NEXT:    vpinsrw $4, %r8d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04]
+; X32-AVX1-NEXT:    vpinsrw $5, %r9d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05]
+; X32-AVX1-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X32-AVX1-NEXT:    vpinsrw $7, %r10d, %xmm0, %xmm0 # encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_setr_epi16:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    movzwl {{[0-9]+}}(%esp), %r10d # encoding: [0x67,0x44,0x0f,0xb7,0x54,0x24,0x10]
+; X32-AVX512-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x67,0x0f,0xb7,0x44,0x24,0x08]
+; X32-AVX512-NEXT:    vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7]
+; X32-AVX512-NEXT:    vpinsrw $1, %esi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc6,0x01]
+; X32-AVX512-NEXT:    vpinsrw $2, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc2,0x02]
+; X32-AVX512-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc1,0x03]
+; X32-AVX512-NEXT:    vpinsrw $4, %r8d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc0,0x04]
+; X32-AVX512-NEXT:    vpinsrw $5, %r9d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc1,0x05]
+; X32-AVX512-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X32-AVX512-NEXT:    vpinsrw $7, %r10d, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0xc4,0xc2,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <8 x i16> undef, i16 %a0, i32 0
   %res1  = insertelement <8 x i16> %res0, i16 %a1, i32 1
   %res2  = insertelement <8 x i16> %res1, i16 %a2, i32 2
@@ -4538,6 +5448,36 @@ define <2 x i64> @test_mm_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwin
 ; X64-AVX512-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
 ; X64-AVX512-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x03]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_setr_epi32:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movd %ecx, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc1]
+; X32-SSE-NEXT:    movd %edx, %xmm1 # encoding: [0x66,0x0f,0x6e,0xca]
+; X32-SSE-NEXT:    punpckldq %xmm0, %xmm1 # encoding: [0x66,0x0f,0x62,0xc8]
+; X32-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-SSE-NEXT:    movd %esi, %xmm2 # encoding: [0x66,0x0f,0x6e,0xd6]
+; X32-SSE-NEXT:    movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7]
+; X32-SSE-NEXT:    punpckldq %xmm2, %xmm0 # encoding: [0x66,0x0f,0x62,0xc2]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-SSE-NEXT:    punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_setr_epi32:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7]
+; X32-AVX1-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc6,0x01]
+; X32-AVX1-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
+; X32-AVX1-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x03]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_setr_epi32:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovd %edi, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7]
+; X32-AVX512-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc6,0x01]
+; X32-AVX512-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
+; X32-AVX512-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x03]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <4 x i32> undef, i32 %a0, i32 0
   %res1  = insertelement <4 x i32> %res0, i32 %a1, i32 1
   %res2  = insertelement <4 x i32> %res1, i32 %a2, i32 2
@@ -4608,6 +5548,30 @@ define <2 x i64> @test_mm_setr_epi64x(i64 %a0, i64 %a1) nounwind {
 ; X64-AVX512-NEXT:    vpunpcklqdq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xc0]
 ; X64-AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[0]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_setr_epi64x:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movq %rsi, %xmm1 # encoding: [0x66,0x48,0x0f,0x6e,0xce]
+; X32-SSE-NEXT:    movq %rdi, %xmm0 # encoding: [0x66,0x48,0x0f,0x6e,0xc7]
+; X32-SSE-NEXT:    punpcklqdq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x6c,0xc1]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_setr_epi64x:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovq %rsi, %xmm0 # encoding: [0xc4,0xe1,0xf9,0x6e,0xc6]
+; X32-AVX1-NEXT:    vmovq %rdi, %xmm1 # encoding: [0xc4,0xe1,0xf9,0x6e,0xcf]
+; X32-AVX1-NEXT:    vpunpcklqdq %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0x6c,0xc0]
+; X32-AVX1-NEXT:    # xmm0 = xmm1[0],xmm0[0]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_setr_epi64x:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovq %rsi, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x6e,0xc6]
+; X32-AVX512-NEXT:    vmovq %rdi, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x6e,0xcf]
+; X32-AVX512-NEXT:    vpunpcklqdq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x6c,0xc0]
+; X32-AVX512-NEXT:    # xmm0 = xmm1[0],xmm0[0]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <2 x i64> undef, i64 %a0, i32 0
   %res1  = insertelement <2 x i64> %res0, i64 %a1, i32 1
   ret <2 x i64> %res1
@@ -4661,6 +5625,24 @@ define <2 x double> @test_mm_setr_pd(double %a0, double %a1) nounwind {
 ; X64-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1]
 ; X64-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_setr_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_setr_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1]
+; X32-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_setr_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1]
+; X32-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <2 x double> undef, double %a0, i32 0
   %res1  = insertelement <2 x double> %res0, double %a1, i32 1
   ret <2 x double> %res1
@@ -5053,6 +6035,21 @@ define double @test_mm_sqrt_sd_scalar(double %a0) nounwind {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_sqrt_sd_scalar:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    sqrtsd %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x51,0xc0]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_sqrt_sd_scalar:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x51,0xc0]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_sqrt_sd_scalar:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %sqrt = call double @llvm.sqrt.f64(double %a0)
   ret double %sqrt
 }
@@ -5334,6 +6331,21 @@ define void @test_mm_store_pd(double *%a0, <2 x double> %a1) {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_store_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movaps %xmm0, (%edi) # encoding: [0x67,0x0f,0x29,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_store_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovaps %xmm0, (%edi) # encoding: [0x67,0xc5,0xf8,0x29,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_store_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovaps %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x29,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %arg0 = bitcast double* %a0 to <2 x double>*
   store <2 x double> %a1, <2 x double>* %arg0, align 16
   ret void
@@ -5384,6 +6396,27 @@ define void @test_mm_store_pd1(double *%a0, <2 x double> %a1) {
 ; X64-AVX512-NEXT:    # xmm0 = xmm0[0,0]
 ; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_store_pd1:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movlhps %xmm0, %xmm0 # encoding: [0x0f,0x16,0xc0]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0,0]
+; X32-SSE-NEXT:    movaps %xmm0, (%edi) # encoding: [0x67,0x0f,0x29,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_store_pd1:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovddup %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x12,0xc0]
+; X32-AVX1-NEXT:    # xmm0 = xmm0[0,0]
+; X32-AVX1-NEXT:    vmovaps %xmm0, (%edi) # encoding: [0x67,0xc5,0xf8,0x29,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_store_pd1:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0]
+; X32-AVX512-NEXT:    # xmm0 = xmm0[0,0]
+; X32-AVX512-NEXT:    vmovaps %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x29,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %arg0 = bitcast double * %a0 to <2 x double>*
   %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
   store <2 x double> %shuf, <2 x double>* %arg0, align 16
@@ -5423,6 +6456,21 @@ define void @test_mm_store_sd(double *%a0, <2 x double> %a1) {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_store_sd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movsd %xmm0, (%edi) # encoding: [0x67,0xf2,0x0f,0x11,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_store_sd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovsd %xmm0, (%edi) # encoding: [0x67,0xc5,0xfb,0x11,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_store_sd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovsd %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x11,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %ext = extractelement <2 x double> %a1, i32 0
   store double %ext, double* %a0, align 1
   ret void
@@ -5461,6 +6509,21 @@ define void @test_mm_store_si128(<2 x i64> *%a0, <2 x i64> %a1) {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_store_si128:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movaps %xmm0, (%edi) # encoding: [0x67,0x0f,0x29,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_store_si128:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovaps %xmm0, (%edi) # encoding: [0x67,0xc5,0xf8,0x29,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_store_si128:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovaps %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x29,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   store <2 x i64> %a1, <2 x i64>* %a0, align 16
   ret void
 }
@@ -5510,6 +6573,27 @@ define void @test_mm_store1_pd(double *%a0, <2 x double> %a1) {
 ; X64-AVX512-NEXT:    # xmm0 = xmm0[0,0]
 ; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_store1_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movlhps %xmm0, %xmm0 # encoding: [0x0f,0x16,0xc0]
+; X32-SSE-NEXT:    # xmm0 = xmm0[0,0]
+; X32-SSE-NEXT:    movaps %xmm0, (%edi) # encoding: [0x67,0x0f,0x29,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_store1_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovddup %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x12,0xc0]
+; X32-AVX1-NEXT:    # xmm0 = xmm0[0,0]
+; X32-AVX1-NEXT:    vmovaps %xmm0, (%edi) # encoding: [0x67,0xc5,0xf8,0x29,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_store1_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0]
+; X32-AVX512-NEXT:    # xmm0 = xmm0[0,0]
+; X32-AVX512-NEXT:    vmovaps %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x29,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %arg0 = bitcast double * %a0 to <2 x double>*
   %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
   store <2 x double> %shuf, <2 x double>* %arg0, align 16
@@ -5561,6 +6645,27 @@ define void @test_mm_storeh_sd(double *%a0, <2 x double> %a1) {
 ; X64-AVX512-NEXT:    # xmm0 = xmm0[1,0]
 ; X64-AVX512-NEXT:    vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_storeh_sd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movhlps %xmm0, %xmm0 # encoding: [0x0f,0x12,0xc0]
+; X32-SSE-NEXT:    # xmm0 = xmm0[1,1]
+; X32-SSE-NEXT:    movsd %xmm0, (%edi) # encoding: [0x67,0xf2,0x0f,0x11,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_storeh_sd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vpermilpd $1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01]
+; X32-AVX1-NEXT:    # xmm0 = xmm0[1,0]
+; X32-AVX1-NEXT:    vmovsd %xmm0, (%edi) # encoding: [0x67,0xc5,0xfb,0x11,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_storeh_sd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vpermilpd $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01]
+; X32-AVX512-NEXT:    # xmm0 = xmm0[1,0]
+; X32-AVX512-NEXT:    vmovsd %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x11,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %ext = extractelement <2 x double> %a1, i32 1
   store double %ext, double* %a0, align 8
   ret void
@@ -5602,6 +6707,24 @@ define void @test_mm_storel_epi64(<2 x i64> *%a0, <2 x i64> %a1) {
 ; X64-AVX512-NEXT:    vmovq %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
 ; X64-AVX512-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_storel_epi64:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movq %xmm0, %rax # encoding: [0x66,0x48,0x0f,0x7e,0xc0]
+; X32-SSE-NEXT:    movq %rax, (%edi) # encoding: [0x67,0x48,0x89,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_storel_epi64:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovq %xmm0, %rax # encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
+; X32-AVX1-NEXT:    movq %rax, (%edi) # encoding: [0x67,0x48,0x89,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_storel_epi64:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovq %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
+; X32-AVX512-NEXT:    movq %rax, (%edi) # encoding: [0x67,0x48,0x89,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %ext = extractelement <2 x i64> %a1, i32 0
   %bc = bitcast <2 x i64> *%a0 to i64*
   store i64 %ext, i64* %bc, align 8
@@ -5641,6 +6764,21 @@ define void @test_mm_storel_sd(double *%a0, <2 x double> %a1) {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_storel_sd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movsd %xmm0, (%edi) # encoding: [0x67,0xf2,0x0f,0x11,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_storel_sd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovsd %xmm0, (%edi) # encoding: [0x67,0xc5,0xfb,0x11,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_storel_sd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovsd %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x11,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %ext = extractelement <2 x double> %a1, i32 0
   store double %ext, double* %a0, align 8
   ret void
@@ -5691,6 +6829,27 @@ define void @test_mm_storer_pd(double *%a0, <2 x double> %a1) {
 ; X64-AVX512-NEXT:    # xmm0 = xmm0[1,0]
 ; X64-AVX512-NEXT:    vmovapd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x29,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_storer_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    shufps $78, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x4e]
+; X32-SSE-NEXT:    # xmm0 = xmm0[2,3,0,1]
+; X32-SSE-NEXT:    movaps %xmm0, (%edi) # encoding: [0x67,0x0f,0x29,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_storer_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vpermilpd $1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01]
+; X32-AVX1-NEXT:    # xmm0 = xmm0[1,0]
+; X32-AVX1-NEXT:    vmovapd %xmm0, (%edi) # encoding: [0x67,0xc5,0xf9,0x29,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_storer_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vpermilpd $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01]
+; X32-AVX512-NEXT:    # xmm0 = xmm0[1,0]
+; X32-AVX512-NEXT:    vmovapd %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf9,0x29,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %arg0 = bitcast double* %a0 to <2 x double>*
   %shuf = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   store <2 x double> %shuf, <2 x double>* %arg0, align 16
@@ -5730,6 +6889,21 @@ define void @test_mm_storeu_pd(double *%a0, <2 x double> %a1) {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_storeu_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movups %xmm0, (%edi) # encoding: [0x67,0x0f,0x11,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_storeu_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovups %xmm0, (%edi) # encoding: [0x67,0xc5,0xf8,0x11,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_storeu_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovups %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x11,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %arg0 = bitcast double* %a0 to <2 x double>*
   store <2 x double> %a1, <2 x double>* %arg0, align 1
   ret void
@@ -5768,6 +6942,21 @@ define void @test_mm_storeu_si128(<2 x i64> *%a0, <2 x i64> %a1) {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_storeu_si128:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movups %xmm0, (%edi) # encoding: [0x67,0x0f,0x11,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_storeu_si128:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovups %xmm0, (%edi) # encoding: [0x67,0xc5,0xf8,0x11,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_storeu_si128:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovups %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x11,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   store <2 x i64> %a1, <2 x i64>* %a0, align 1
   ret void
 }
@@ -5808,6 +6997,24 @@ define void @test_mm_storeu_si64(i8* nocapture %A, <2 x i64> %B) {
 ; X64-AVX512-NEXT:    vmovq %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
 ; X64-AVX512-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_storeu_si64:
+; X32-SSE:       # %bb.0: # %entry
+; X32-SSE-NEXT:    movq %xmm0, %rax # encoding: [0x66,0x48,0x0f,0x7e,0xc0]
+; X32-SSE-NEXT:    movq %rax, (%edi) # encoding: [0x67,0x48,0x89,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_storeu_si64:
+; X32-AVX1:       # %bb.0: # %entry
+; X32-AVX1-NEXT:    vmovq %xmm0, %rax # encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
+; X32-AVX1-NEXT:    movq %rax, (%edi) # encoding: [0x67,0x48,0x89,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_storeu_si64:
+; X32-AVX512:       # %bb.0: # %entry
+; X32-AVX512-NEXT:    vmovq %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
+; X32-AVX512-NEXT:    movq %rax, (%edi) # encoding: [0x67,0x48,0x89,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
 entry:
   %vecext.i = extractelement <2 x i64> %B, i32 0
   %__v.i = bitcast i8* %A to i64*
@@ -5854,6 +7061,24 @@ define void @test_mm_storeu_si32(i8* nocapture %A, <2 x i64> %B) {
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0xc0]
 ; X64-AVX512-NEXT:    movl %eax, (%rdi) # encoding: [0x89,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_storeu_si32:
+; X32-SSE:       # %bb.0: # %entry
+; X32-SSE-NEXT:    movd %xmm0, %eax # encoding: [0x66,0x0f,0x7e,0xc0]
+; X32-SSE-NEXT:    movl %eax, (%edi) # encoding: [0x67,0x89,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_storeu_si32:
+; X32-AVX1:       # %bb.0: # %entry
+; X32-AVX1-NEXT:    vmovd %xmm0, %eax # encoding: [0xc5,0xf9,0x7e,0xc0]
+; X32-AVX1-NEXT:    movl %eax, (%edi) # encoding: [0x67,0x89,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_storeu_si32:
+; X32-AVX512:       # %bb.0: # %entry
+; X32-AVX512-NEXT:    vmovd %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0xc0]
+; X32-AVX512-NEXT:    movl %eax, (%edi) # encoding: [0x67,0x89,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
 entry:
   %0 = bitcast <2 x i64> %B to <4 x i32>
   %vecext.i = extractelement <4 x i32> %0, i32 0
@@ -5901,6 +7126,24 @@ define void @test_mm_storeu_si16(i8* nocapture %A, <2 x i64> %B) {
 ; X64-AVX512-NEXT:    vmovd %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0xc0]
 ; X64-AVX512-NEXT:    movw %ax, (%rdi) # encoding: [0x66,0x89,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_storeu_si16:
+; X32-SSE:       # %bb.0: # %entry
+; X32-SSE-NEXT:    movd %xmm0, %eax # encoding: [0x66,0x0f,0x7e,0xc0]
+; X32-SSE-NEXT:    movw %ax, (%edi) # encoding: [0x67,0x66,0x89,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_storeu_si16:
+; X32-AVX1:       # %bb.0: # %entry
+; X32-AVX1-NEXT:    vmovd %xmm0, %eax # encoding: [0xc5,0xf9,0x7e,0xc0]
+; X32-AVX1-NEXT:    movw %ax, (%edi) # encoding: [0x67,0x66,0x89,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_storeu_si16:
+; X32-AVX512:       # %bb.0: # %entry
+; X32-AVX512-NEXT:    vmovd %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0xc0]
+; X32-AVX512-NEXT:    movw %ax, (%edi) # encoding: [0x67,0x66,0x89,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
 entry:
   %0 = bitcast <2 x i64> %B to <8 x i16>
   %vecext.i = extractelement <8 x i16> %0, i32 0
@@ -5942,6 +7185,21 @@ define void @test_mm_stream_pd(double *%a0, <2 x double> %a1) {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vmovntps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_stream_pd:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movntps %xmm0, (%edi) # encoding: [0x67,0x0f,0x2b,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_stream_pd:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovntps %xmm0, (%edi) # encoding: [0x67,0xc5,0xf8,0x2b,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_stream_pd:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovntps %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x2b,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   %arg0 = bitcast double* %a0 to <2 x double>*
   store <2 x double> %a1, <2 x double>* %arg0, align 16, !nontemporal !0
   ret void
@@ -5959,6 +7217,11 @@ define void @test_mm_stream_si32(i32 *%a0, i32 %a1) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movntil %esi, (%rdi) # encoding: [0x0f,0xc3,0x37]
 ; X64-NEXT:    retq # encoding: [0xc3]
+;
+; X32-LABEL: test_mm_stream_si32:
+; X32:       # %bb.0:
+; X32-NEXT:    movntil %esi, (%edi) # encoding: [0x67,0x0f,0xc3,0x37]
+; X32-NEXT:    retq # encoding: [0xc3]
   store i32 %a1, i32* %a0, align 1, !nontemporal !0
   ret void
 }
@@ -5996,6 +7259,21 @@ define void @test_mm_stream_si128(<2 x i64> *%a0, <2 x i64> %a1) {
 ; X64-AVX512:       # %bb.0:
 ; X64-AVX512-NEXT:    vmovntps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x07]
 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
+;
+; X32-SSE-LABEL: test_mm_stream_si128:
+; X32-SSE:       # %bb.0:
+; X32-SSE-NEXT:    movntps %xmm0, (%edi) # encoding: [0x67,0x0f,0x2b,0x07]
+; X32-SSE-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX1-LABEL: test_mm_stream_si128:
+; X32-AVX1:       # %bb.0:
+; X32-AVX1-NEXT:    vmovntps %xmm0, (%edi) # encoding: [0x67,0xc5,0xf8,0x2b,0x07]
+; X32-AVX1-NEXT:    retq # encoding: [0xc3]
+;
+; X32-AVX512-LABEL: test_mm_stream_si128:
+; X32-AVX512:       # %bb.0:
+; X32-AVX512-NEXT:    vmovntps %xmm0, (%edi) # EVEX TO VEX Compression encoding: [0x67,0xc5,0xf8,0x2b,0x07]
+; X32-AVX512-NEXT:    retq # encoding: [0xc3]
   store <2 x i64> %a1, <2 x i64>* %a0, align 16, !nontemporal !0
   ret void
 }

diff  --git a/llvm/test/MC/X86/maskmovdqu.s b/llvm/test/MC/X86/maskmovdqu.s
new file mode 100644
index 0000000000000..685ee99d770fa
--- /dev/null
+++ b/llvm/test/MC/X86/maskmovdqu.s
@@ -0,0 +1,15 @@
+// RUN: llvm-mc -triple i386-- --show-encoding %s |\
+// RUN:   FileCheck %s --check-prefixes=CHECK,ENCODING
+
+// RUN: llvm-mc -triple i386-- -filetype=obj %s |\
+// RUN:   llvm-objdump -d - | FileCheck %s
+
+// CHECK-NOT: addr32
+// CHECK: maskmovdqu %xmm1, %xmm0
+// ENCODING:  encoding: [0x66,0x0f,0xf7,0xc1]
+maskmovdqu %xmm1, %xmm0
+
+// CHECK-NOT: addr32
+// CHECK:  vmaskmovdqu %xmm1, %xmm0
+// ENCODING:  encoding: [0xc5,0xf9,0xf7,0xc1]
+vmaskmovdqu %xmm1, %xmm0

diff  --git a/llvm/test/MC/X86/maskmovdqu64.s b/llvm/test/MC/X86/maskmovdqu64.s
new file mode 100644
index 0000000000000..ba840a4c9a642
--- /dev/null
+++ b/llvm/test/MC/X86/maskmovdqu64.s
@@ -0,0 +1,27 @@
+// RUN: llvm-mc -triple x86_64-- --show-encoding %s |\
+// RUN:   FileCheck %s --check-prefixes=CHECK,ENCODING
+
+// RUN: llvm-mc -triple x86_64-- -filetype=obj %s |\
+// RUN:   llvm-objdump -d - | FileCheck %s
+
+// CHECK-NOT: addr32
+// CHECK: maskmovdqu %xmm1, %xmm0
+// ENCODING:  encoding: [0x66,0x0f,0xf7,0xc1]
+maskmovdqu %xmm1, %xmm0
+
+// CHECK-NOT: addr32
+// CHECK:  vmaskmovdqu %xmm1, %xmm0
+// ENCODING:  encoding: [0xc5,0xf9,0xf7,0xc1]
+vmaskmovdqu %xmm1, %xmm0
+
+// CHECK:  addr32
+// ENCODING:  encoding: [0x67]
+// CHECK: maskmovdqu %xmm1, %xmm0
+// ENCODING:  encoding: [0x66,0x0f,0xf7,0xc1]
+addr32 maskmovdqu %xmm1, %xmm0
+
+// CHECK:  addr32
+// ENCODING:  encoding: [0x67]
+// CHECK:  vmaskmovdqu %xmm1, %xmm0
+// ENCODING:  encoding: [0xc5,0xf9,0xf7,0xc1]
+addr32 vmaskmovdqu %xmm1, %xmm0

diff  --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp
index 2d25289608f30..89069ec3e4ff9 100644
--- a/llvm/utils/TableGen/X86DisassemblerTables.cpp
+++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp
@@ -102,7 +102,8 @@ static inline bool inheritsFrom(InstructionContext child,
   case IC_64BIT_ADSIZE:
     return (noPrefix && inheritsFrom(child, IC_64BIT_OPSIZE_ADSIZE, noPrefix));
   case IC_64BIT_OPSIZE_ADSIZE:
-    return false;
+    return (noPrefix &&
+            inheritsFrom(child, IC_64BIT_VEX_OPSIZE_ADSIZE, noPrefix));
   case IC_XD:
     return inheritsFrom(child, IC_64BIT_XD);
   case IC_XS:
@@ -123,10 +124,11 @@ static inline bool inheritsFrom(InstructionContext child,
   case IC_64BIT_OPSIZE:
     return inheritsFrom(child, IC_64BIT_REXW_OPSIZE) ||
            (!AdSize64 && inheritsFrom(child, IC_64BIT_OPSIZE_ADSIZE)) ||
-           (!AdSize64 && inheritsFrom(child, IC_64BIT_REXW_ADSIZE));
+           (!AdSize64 && inheritsFrom(child, IC_64BIT_REXW_ADSIZE)) ||
+           (!AdSize64 && inheritsFrom(child, IC_64BIT_VEX_OPSIZE_ADSIZE));
   case IC_64BIT_XD:
-    return(inheritsFrom(child, IC_64BIT_REXW_XD) ||
-           (!AdSize64 && inheritsFrom(child, IC_64BIT_XD_ADSIZE)));
+    return (inheritsFrom(child, IC_64BIT_REXW_XD) ||
+            (!AdSize64 && inheritsFrom(child, IC_64BIT_XD_ADSIZE)));
   case IC_64BIT_XS:
     return(inheritsFrom(child, IC_64BIT_REXW_XS) ||
            (!AdSize64 && inheritsFrom(child, IC_64BIT_XS_ADSIZE)));
@@ -156,7 +158,12 @@ static inline bool inheritsFrom(InstructionContext child,
   case IC_VEX_OPSIZE:
     return (VEX_LIG && VEX_WIG && inheritsFrom(child, IC_VEX_L_W_OPSIZE)) ||
            (VEX_WIG && inheritsFrom(child, IC_VEX_W_OPSIZE)) ||
-           (VEX_LIG && inheritsFrom(child, IC_VEX_L_OPSIZE));
+           (VEX_LIG && inheritsFrom(child, IC_VEX_L_OPSIZE)) ||
+           inheritsFrom(child, IC_64BIT_VEX_OPSIZE);
+  case IC_64BIT_VEX_OPSIZE:
+    return inheritsFrom(child, IC_64BIT_VEX_OPSIZE_ADSIZE);
+  case IC_64BIT_VEX_OPSIZE_ADSIZE:
+    return false;
   case IC_VEX_W:
     return VEX_LIG && inheritsFrom(child, IC_VEX_L_W);
   case IC_VEX_W_XS:
@@ -881,6 +888,9 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
     if ((index & ATTR_EVEX) || (index & ATTR_VEX) || (index & ATTR_VEXL)) {
       if (index & ATTR_EVEX)
         o << "IC_EVEX";
+      else if ((index & (ATTR_64BIT | ATTR_VEXL | ATTR_REXW | ATTR_OPSIZE)) ==
+               (ATTR_64BIT | ATTR_OPSIZE))
+        o << "IC_64BIT_VEX";
       else
         o << "IC_VEX";
 
@@ -892,9 +902,13 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
       if (index & ATTR_REXW)
         o << "_W";
 
-      if (index & ATTR_OPSIZE)
+      if (index & ATTR_OPSIZE) {
         o << "_OPSIZE";
-      else if (index & ATTR_XD)
+        if ((index & (ATTR_64BIT | ATTR_EVEX | ATTR_VEX | ATTR_VEXL |
+                      ATTR_REXW | ATTR_ADSIZE)) ==
+            (ATTR_64BIT | ATTR_VEX | ATTR_ADSIZE))
+          o << "_ADSIZE";
+      } else if (index & ATTR_XD)
         o << "_XD";
       else if (index & ATTR_XS)
         o << "_XS";
@@ -908,8 +922,7 @@ void DisassemblerTables::emitContextTable(raw_ostream &o, unsigned &i) const {
         if (index & ATTR_EVEXB)
           o << "_B";
       }
-    }
-    else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XS))
+    } else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XS))
       o << "IC_64BIT_REXW_XS";
     else if ((index & ATTR_64BIT) && (index & ATTR_REXW) && (index & ATTR_XD))
       o << "IC_64BIT_REXW_XD";

diff  --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp
index e4b7c05cfb881..c2ca3791ac366 100644
--- a/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -125,13 +125,7 @@ RecognizableInstr::RecognizableInstr(DisassemblerTables &tables,
     return;
   }
 
-  // Special case since there is no attribute class for 64-bit and VEX
-  if (Name == "VMASKMOVDQU64") {
-    ShouldBeEmitted = false;
-    return;
-  }
-
-  ShouldBeEmitted  = true;
+  ShouldBeEmitted = true;
 }
 
 void RecognizableInstr::processInstr(DisassemblerTables &tables,
@@ -267,6 +261,11 @@ InstructionContext RecognizableInstr::insnContext() const {
       insnContext = IC_VEX_L_OPSIZE;
     else if (OpPrefix == X86Local::PD && HasVEX_W)
       insnContext = IC_VEX_W_OPSIZE;
+    else if (OpPrefix == X86Local::PD && Is64Bit &&
+             AdSize == X86Local::AdSize32)
+      insnContext = IC_64BIT_VEX_OPSIZE_ADSIZE;
+    else if (OpPrefix == X86Local::PD && Is64Bit)
+      insnContext = IC_64BIT_VEX_OPSIZE;
     else if (OpPrefix == X86Local::PD)
       insnContext = IC_VEX_OPSIZE;
     else if (HasVEX_LPrefix && OpPrefix == X86Local::XS)


        

