[llvm] r330895 - [x86] NFC: Add tests for idiomatic usage patterns of SSE4.2 string

Chandler Carruth via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 25 20:12:18 PDT 2018


Author: chandlerc
Date: Wed Apr 25 20:12:17 2018
New Revision: 330895

URL: http://llvm.org/viewvc/llvm-project?rev=330895&view=rev
Log:
[x86] NFC: Add tests for idiomatic usage patterns of SSE4.2 string
comparison instructions (pcmp[ei]stri*).

These will help show improvements from fixes to PR37246.

I've not really covered the mask forms of these intrinsics, as I don't have
as good an intuition about the likely usage patterns there. I'm happy for
someone to extend this with tests covering the mask forms.

Added:
    llvm/trunk/test/CodeGen/X86/sse42.ll

Added: llvm/trunk/test/CodeGen/X86/sse42.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse42.ll?rev=330895&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse42.ll (added)
+++ llvm/trunk/test/CodeGen/X86/sse42.ll Wed Apr 25 20:12:17 2018
@@ -0,0 +1,976 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=sse4.2 -mcpu=x86-64 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 -mcpu=x86-64 | FileCheck %s --check-prefix=X64
+
+declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8) ; takes two lengths; its result is compared against 0 by the pcmpestri_*_eq_* tests
+declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8) ; takes two lengths; its result is used as an element index by the pcmpestri_*_idx_* and *_diff_* tests
+declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8) ; same role as pcmpestric128 in the pcmpistri_* tests, but without length operands
+declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8) ; same role as pcmpestri128 in the pcmpistri_* tests, but without length operands
+
+define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_reg_eq_i8:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT:    setae %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestri_reg_eq_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
+entry: ; Lowering of "pcmpestric128(...) == 0" with both vectors passed as register arguments.
+  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24) ; the assertions above expect the two lengths to be placed in %eax and %edx
+  %result = icmp eq i32 %c, 0 ; the assertions above expect this compare to become a single setae
+  ret i1 %result
+}
+
+define i32 @pcmpestri_reg_idx_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_reg_idx_i8:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestri_reg_idx_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpestri128 is the function result; vectors are register arguments.
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  ret i32 %idx ; the assertions above expect the index to be copied from %ecx to %eax
+}
+
+define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_reg_diff_i8:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $48, %esp
+; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    movl 12(%ebp), %edx
+; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT:    cmpl $16, %ecx
+; X32-NEXT:    jne .LBB2_2
+; X32-NEXT:  # %bb.1:
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    jmp .LBB2_3
+; X32-NEXT:  .LBB2_2: # %compare
+; X32-NEXT:    movdqa %xmm0, (%esp)
+; X32-NEXT:    andl $15, %ecx
+; X32-NEXT:    movb (%esp,%ecx), %al
+; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    subb 16(%esp,%ecx), %al
+; X32-NEXT:  .LBB2_3: # %exit
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestri_reg_diff_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT:    cmpl $16, %ecx
+; X64-NEXT:    jne .LBB2_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB2_2: # %compare
+; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $15, %ecx
+; X64-NEXT:    movb -24(%rsp,%rcx), %al
+; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    subb -40(%rsp,%rcx), %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpestri128 selects the two elements to subtract; vectors are register arguments.
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  %eq = icmp eq i32 %idx, 16 ; index 16 bypasses the element lookup; the phi in %exit then yields 0
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx ; variable index: the assertions above expect the vector to be stored to the stack and indexed there
+  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
+  %sub = sub i8 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i8 [ 0, %entry ], [ %sub, %compare ] ; 0 when %compare was skipped
+  %result_ext = zext i8 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpestri_mem_eq_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_mem_eq_i8:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movdqu (%esi), %xmm0
+; X32-NEXT:    movdqu (%ecx), %xmm1
+; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT:    setae %al
+; X32-NEXT:    popl %esi
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestri_mem_eq_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu (%rdx), %xmm1
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
+entry: ; Lowering of "pcmpestric128(...) == 0" with both vectors loaded from memory.
+  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
+  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 ; align 1: the assertions above expect an unaligned movdqu load
+  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
+  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
+  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  %result = icmp eq i32 %c, 0 ; the assertions above expect this compare to become a single setae
+  ret i1 %result
+}
+
+define i32 @pcmpestri_mem_idx_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_mem_idx_i8:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movdqu (%esi), %xmm0
+; X32-NEXT:    movdqu (%ecx), %xmm1
+; X32-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    popl %esi
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestri_mem_idx_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu (%rdx), %xmm1
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpestri128 is the function result; vectors are loaded from memory.
+  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
+  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 ; align 1: the assertions above expect an unaligned movdqu load
+  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
+  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  ret i32 %idx ; the assertions above expect the index to be copied from %ecx to %eax
+}
+
+define i32 @pcmpestri_mem_diff_i8(i8* %lhs_ptr, i32 %lhs_len, i8* %rhs_ptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_mem_diff_i8:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $48, %esp
+; X32-NEXT:    movl 12(%ebp), %eax
+; X32-NEXT:    movl 20(%ebp), %edx
+; X32-NEXT:    movl 16(%ebp), %ecx
+; X32-NEXT:    movl 8(%ebp), %esi
+; X32-NEXT:    movdqu (%esi), %xmm1
+; X32-NEXT:    movdqu (%ecx), %xmm0
+; X32-NEXT:    pcmpestri $24, %xmm0, %xmm1
+; X32-NEXT:    cmpl $16, %ecx
+; X32-NEXT:    jne .LBB5_2
+; X32-NEXT:  # %bb.1:
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    jmp .LBB5_3
+; X32-NEXT:  .LBB5_2: # %compare
+; X32-NEXT:    movdqa %xmm1, (%esp)
+; X32-NEXT:    andl $15, %ecx
+; X32-NEXT:    movb (%esp,%ecx), %al
+; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    subb 16(%esp,%ecx), %al
+; X32-NEXT:  .LBB5_3: # %exit
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    leal -4(%ebp), %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestri_mem_diff_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqu (%rdi), %xmm1
+; X64-NEXT:    movdqu (%rdx), %xmm0
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    pcmpestri $24, %xmm0, %xmm1
+; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT:    cmpl $16, %ecx
+; X64-NEXT:    jne .LBB5_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB5_2: # %compare
+; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $15, %ecx
+; X64-NEXT:    movb -24(%rsp,%rcx), %al
+; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    subb -40(%rsp,%rcx), %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpestri128 selects the two elements to subtract; vectors are loaded from memory.
+  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
+  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 ; align 1: the assertions above expect an unaligned movdqu load
+  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
+  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
+  %eq = icmp eq i32 %idx, 16 ; index 16 bypasses the element lookup; the phi in %exit then yields 0
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx ; variable index: the assertions above expect the loaded vector to be stored to the stack and indexed there
+  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
+  %sub = sub i8 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i8 [ 0, %entry ], [ %sub, %compare ] ; 0 when %compare was skipped
+  %result_ext = zext i8 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpestri_reg_eq_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_reg_eq_i16:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    pcmpestri $25, %xmm1, %xmm0
+; X32-NEXT:    setae %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestri_reg_eq_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    pcmpestri $25, %xmm1, %xmm0
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
+entry: ; Lowering of "pcmpestric128(...) == 0" on <8 x i16> vectors passed as register arguments.
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> ; the intrinsic is declared on <16 x i8>, so the i16 vectors are bitcast first
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25) ; 25 = 24 | 1: imm8 bit 0 selects 16-bit elements, the same immediate pcmpestri_mem_eq_i16 uses
+  %result = icmp eq i32 %c, 0 ; the assertions above expect this compare to become a single setae
+  ret i1 %result
+}
+
+define i32 @pcmpestri_reg_idx_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_reg_idx_i16:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    pcmpestri $25, %xmm1, %xmm0
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestri_reg_idx_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    pcmpestri $25, %xmm1, %xmm0
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpestri128 is the function result; <8 x i16> vectors are register arguments.
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> ; the intrinsic is declared on <16 x i8>, so the i16 vectors are bitcast first
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25) ; 25 = 24 | 1: imm8 bit 0 selects 16-bit elements, the same immediate pcmpestri_mem_idx_i16 uses
+  ret i32 %idx ; the assertions above expect the index to be copied from %ecx to %eax
+}
+
+define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_reg_diff_i16:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $48, %esp
+; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    movl 12(%ebp), %edx
+; X32-NEXT:    pcmpestri $25, %xmm1, %xmm0
+; X32-NEXT:    cmpl $8, %ecx
+; X32-NEXT:    jne .LBB8_2
+; X32-NEXT:  # %bb.1:
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    jmp .LBB8_3
+; X32-NEXT:  .LBB8_2: # %compare
+; X32-NEXT:    movdqa %xmm0, (%esp)
+; X32-NEXT:    addl %ecx, %ecx
+; X32-NEXT:    andl $14, %ecx
+; X32-NEXT:    movzwl (%esp,%ecx), %eax
+; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    subw 16(%esp,%ecx), %ax
+; X32-NEXT:  .LBB8_3: # %exit
+; X32-NEXT:    movzwl %ax, %eax
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestri_reg_diff_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %edx
+; X64-NEXT:    pcmpestri $25, %xmm1, %xmm0
+; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT:    cmpl $8, %ecx
+; X64-NEXT:    jne .LBB8_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB8_2: # %compare
+; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $7, %ecx
+; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpestri128 selects the two i16 elements to subtract; vectors are register arguments.
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> ; the intrinsic is declared on <16 x i8>, so the i16 vectors are bitcast first
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25) ; 25 = 24 | 1: imm8 bit 0 selects 16-bit elements, so %idx counts i16 elements as in pcmpestri_mem_diff_i16
+  %eq = icmp eq i32 %idx, 8 ; 8 = element count of <8 x i16>; it bypasses the lookup and the phi in %exit yields 0
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx ; variable index: the assertions above expect the vector to be stored to the stack and indexed there
+  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
+  %sub = sub i16 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i16 [ 0, %entry ], [ %sub, %compare ] ; 0 when %compare was skipped
+  %result_ext = zext i16 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpestri_mem_eq_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_mem_eq_i16:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movdqu (%esi), %xmm0
+; X32-NEXT:    movdqu (%ecx), %xmm1
+; X32-NEXT:    pcmpestri $25, %xmm1, %xmm0
+; X32-NEXT:    setae %al
+; X32-NEXT:    popl %esi
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestri_mem_eq_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu (%rdx), %xmm1
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    pcmpestri $25, %xmm1, %xmm0
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
+entry: ; Lowering of "pcmpestric128(...) == 0" on <8 x i16> vectors loaded from memory.
+  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
+  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 ; align 1: the assertions above expect an unaligned movdqu load
+  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
+  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> ; the intrinsic is declared on <16 x i8>, so the i16 vectors are bitcast first
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
+  %result = icmp eq i32 %c, 0 ; the assertions above expect this compare to become a single setae
+  ret i1 %result
+}
+
+define i32 @pcmpestri_mem_idx_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_mem_idx_i16:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movdqu (%esi), %xmm0
+; X32-NEXT:    movdqu (%ecx), %xmm1
+; X32-NEXT:    pcmpestri $25, %xmm1, %xmm0
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    popl %esi
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestri_mem_idx_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu (%rdx), %xmm1
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    pcmpestri $25, %xmm1, %xmm0
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpestri128 is the function result; <8 x i16> vectors are loaded from memory.
+  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
+  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 ; align 1: the assertions above expect an unaligned movdqu load
+  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
+  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> ; the intrinsic is declared on <16 x i8>, so the i16 vectors are bitcast first
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
+  ret i32 %idx ; the assertions above expect the index to be copied from %ecx to %eax
+}
+
+define i32 @pcmpestri_mem_diff_i16(i16* %lhs_ptr, i32 %lhs_len, i16* %rhs_ptr, i32 %rhs_len) nounwind {
+; X32-LABEL: pcmpestri_mem_diff_i16:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $48, %esp
+; X32-NEXT:    movl 12(%ebp), %eax
+; X32-NEXT:    movl 20(%ebp), %edx
+; X32-NEXT:    movl 16(%ebp), %ecx
+; X32-NEXT:    movl 8(%ebp), %esi
+; X32-NEXT:    movdqu (%esi), %xmm1
+; X32-NEXT:    movdqu (%ecx), %xmm0
+; X32-NEXT:    pcmpestri $25, %xmm0, %xmm1
+; X32-NEXT:    cmpl $8, %ecx
+; X32-NEXT:    jne .LBB11_2
+; X32-NEXT:  # %bb.1:
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    jmp .LBB11_3
+; X32-NEXT:  .LBB11_2: # %compare
+; X32-NEXT:    movdqa %xmm1, (%esp)
+; X32-NEXT:    addl %ecx, %ecx
+; X32-NEXT:    andl $14, %ecx
+; X32-NEXT:    movzwl (%esp,%ecx), %eax
+; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    subw 16(%esp,%ecx), %ax
+; X32-NEXT:  .LBB11_3: # %exit
+; X32-NEXT:    movzwl %ax, %eax
+; X32-NEXT:    leal -4(%ebp), %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpestri_mem_diff_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqu (%rdi), %xmm1
+; X64-NEXT:    movdqu (%rdx), %xmm0
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl %ecx, %edx
+; X64-NEXT:    pcmpestri $25, %xmm0, %xmm1
+; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT:    cmpl $8, %ecx
+; X64-NEXT:    jne .LBB11_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB11_2: # %compare
+; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $7, %ecx
+; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpestri128 selects the two i16 elements to subtract; vectors are loaded from memory.
+  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
+  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 ; align 1: the assertions above expect an unaligned movdqu load
+  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
+  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> ; the intrinsic is declared on <16 x i8>, so the i16 vectors are bitcast first
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
+  %eq = icmp eq i32 %idx, 8 ; index 8 bypasses the element lookup; the phi in %exit then yields 0
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx ; variable index: the assertions above expect the loaded vector to be stored to the stack and indexed there
+  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
+  %sub = sub i16 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i16 [ 0, %entry ], [ %sub, %compare ] ; 0 when %compare was skipped
+  %result_ext = zext i16 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpistri_reg_eq_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
+; X32-LABEL: pcmpistri_reg_eq_i8:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT:    setae %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistri_reg_eq_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
+entry: ; Lowering of "pcmpistric128(...) == 0" with both vectors passed as register arguments.
+  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %result = icmp eq i32 %c, 0 ; the assertions above expect this compare to become a single setae
+  ret i1 %result
+}
+
+define i32 @pcmpistri_reg_idx_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
+; X32-LABEL: pcmpistri_reg_idx_i8:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistri_reg_idx_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpistri128 is the function result; vectors are register arguments.
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  ret i32 %idx ; the assertions above expect the index to be copied from %ecx to %eax
+}
+
+define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
+; X32-LABEL: pcmpistri_reg_diff_i8:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT:    cmpl $16, %ecx
+; X32-NEXT:    jne .LBB14_2
+; X32-NEXT:  # %bb.1:
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+; X32-NEXT:  .LBB14_2: # %compare
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $48, %esp
+; X32-NEXT:    movdqa %xmm0, (%esp)
+; X32-NEXT:    andl $15, %ecx
+; X32-NEXT:    movb (%esp,%ecx), %al
+; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    subb 16(%esp,%ecx), %al
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistri_reg_diff_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT:    cmpl $16, %ecx
+; X64-NEXT:    jne .LBB14_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB14_2: # %compare
+; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $15, %ecx
+; X64-NEXT:    movb -24(%rsp,%rcx), %al
+; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    subb -40(%rsp,%rcx), %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpistri128 selects the two elements to subtract; vectors are register arguments.
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %eq = icmp eq i32 %idx, 16 ; index 16 bypasses the element lookup; the phi in %exit then yields 0
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx ; variable index: the assertions above expect the vector to be stored to the stack and indexed there
+  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
+  %sub = sub i8 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i8 [ 0, %entry ], [ %sub, %compare ] ; 0 when %compare was skipped
+  %result_ext = zext i8 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpistri_mem_eq_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
+; X32-LABEL: pcmpistri_mem_eq_i8:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movdqu (%ecx), %xmm0
+; X32-NEXT:    movdqu (%eax), %xmm1
+; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT:    setae %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistri_mem_eq_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu (%rsi), %xmm1
+; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
+entry: ; Lowering of "pcmpistric128(...) == 0" with both vectors loaded from memory.
+  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
+  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 ; align 1: the assertions above expect an unaligned movdqu load
+  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
+  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
+  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %result = icmp eq i32 %c, 0 ; the assertions above expect this compare to become a single setae
+  ret i1 %result
+}
+
+define i32 @pcmpistri_mem_idx_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
+; X32-LABEL: pcmpistri_mem_idx_i8:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movdqu (%ecx), %xmm0
+; X32-NEXT:    movdqu (%eax), %xmm1
+; X32-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistri_mem_idx_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu (%rsi), %xmm1
+; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpistri128 is the function result; vectors are loaded from memory.
+  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
+  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 ; align 1: the assertions above expect an unaligned movdqu load
+  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
+  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  ret i32 %idx ; the assertions above expect the index to be copied from %ecx to %eax
+}
+
+define i32 @pcmpistri_mem_diff_i8(i8* %lhs_ptr, i8* %rhs_ptr) nounwind {
+; X32-LABEL: pcmpistri_mem_diff_i8:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $48, %esp
+; X32-NEXT:    movl 12(%ebp), %eax
+; X32-NEXT:    movl 8(%ebp), %ecx
+; X32-NEXT:    movdqu (%ecx), %xmm1
+; X32-NEXT:    movdqu (%eax), %xmm0
+; X32-NEXT:    pcmpistri $24, %xmm0, %xmm1
+; X32-NEXT:    cmpl $16, %ecx
+; X32-NEXT:    jne .LBB17_2
+; X32-NEXT:  # %bb.1:
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    jmp .LBB17_3
+; X32-NEXT:  .LBB17_2: # %compare
+; X32-NEXT:    movdqa %xmm1, (%esp)
+; X32-NEXT:    andl $15, %ecx
+; X32-NEXT:    movb (%esp,%ecx), %al
+; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    subb 16(%esp,%ecx), %al
+; X32-NEXT:  .LBB17_3: # %exit
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistri_mem_diff_i8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqu (%rdi), %xmm1
+; X64-NEXT:    movdqu (%rsi), %xmm0
+; X64-NEXT:    pcmpistri $24, %xmm0, %xmm1
+; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT:    cmpl $16, %ecx
+; X64-NEXT:    jne .LBB17_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB17_2: # %compare
+; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $15, %ecx
+; X64-NEXT:    movb -24(%rsp,%rcx), %al
+; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    subb -40(%rsp,%rcx), %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpistri128 selects the two elements to subtract; vectors are loaded from memory.
+  %lhs_vptr = bitcast i8* %lhs_ptr to <16 x i8>*
+  %lhs = load <16 x i8>, <16 x i8>* %lhs_vptr, align 1 ; align 1: the assertions above expect an unaligned movdqu load
+  %rhs_vptr = bitcast i8* %rhs_ptr to <16 x i8>*
+  %rhs = load <16 x i8>, <16 x i8>* %rhs_vptr, align 1
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
+  %eq = icmp eq i32 %idx, 16 ; index 16 bypasses the element lookup; the phi in %exit then yields 0
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx ; variable index: the assertions above expect the loaded vector to be stored to the stack and indexed there
+  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
+  %sub = sub i8 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i8 [ 0, %entry ], [ %sub, %compare ] ; 0 when %compare was skipped
+  %result_ext = zext i8 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpistri_reg_eq_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
+; X32-LABEL: pcmpistri_reg_eq_i16:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pcmpistri $25, %xmm1, %xmm0
+; X32-NEXT:    setae %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistri_reg_eq_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pcmpistri $25, %xmm1, %xmm0
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
+entry: ; Lowering of "pcmpistric128(...) == 0" on <8 x i16> vectors passed as register arguments.
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> ; the intrinsic is declared on <16 x i8>, so the i16 vectors are bitcast first
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25) ; 25 = 24 | 1: imm8 bit 0 selects 16-bit elements, the same immediate pcmpistri_mem_eq_i16 uses
+  %result = icmp eq i32 %c, 0 ; the assertions above expect this compare to become a single setae
+  ret i1 %result
+}
+
+define i32 @pcmpistri_reg_idx_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
+; X32-LABEL: pcmpistri_reg_idx_i16:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pcmpistri $25, %xmm1, %xmm0
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistri_reg_idx_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pcmpistri $25, %xmm1, %xmm0
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpistri128 is the function result; <8 x i16> vectors are register arguments.
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> ; the intrinsic is declared on <16 x i8>, so the i16 vectors are bitcast first
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25) ; 25 = 24 | 1: imm8 bit 0 selects 16-bit elements, the same immediate pcmpistri_mem_idx_i16 uses
+  ret i32 %idx ; the assertions above expect the index to be copied from %ecx to %eax
+}
+
+define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
+; X32-LABEL: pcmpistri_reg_diff_i16:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pcmpistri $25, %xmm1, %xmm0
+; X32-NEXT:    cmpl $8, %ecx
+; X32-NEXT:    jne .LBB20_2
+; X32-NEXT:  # %bb.1:
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    movzwl %ax, %eax
+; X32-NEXT:    retl
+; X32-NEXT:  .LBB20_2: # %compare
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $48, %esp
+; X32-NEXT:    movdqa %xmm0, (%esp)
+; X32-NEXT:    addl %ecx, %ecx
+; X32-NEXT:    andl $14, %ecx
+; X32-NEXT:    movzwl (%esp,%ecx), %eax
+; X32-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    subw 16(%esp,%ecx), %ax
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    movzwl %ax, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistri_reg_diff_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pcmpistri $25, %xmm1, %xmm0
+; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT:    cmpl $8, %ecx
+; X64-NEXT:    jne .LBB20_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB20_2: # %compare
+; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $7, %ecx
+; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpistri128 selects the two i16 elements to subtract; vectors are register arguments.
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> ; the intrinsic is declared on <16 x i8>, so the i16 vectors are bitcast first
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25) ; 25 = 24 | 1: imm8 bit 0 selects 16-bit elements, so %idx counts i16 elements as in pcmpistri_mem_diff_i16
+  %eq = icmp eq i32 %idx, 8 ; 8 = element count of <8 x i16>; it bypasses the lookup and the phi in %exit yields 0
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx ; variable index: the assertions above expect the vector to be stored to the stack and indexed there
+  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
+  %sub = sub i16 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i16 [ 0, %entry ], [ %sub, %compare ] ; 0 when %compare was skipped
+  %result_ext = zext i16 %result to i32
+  ret i32 %result_ext
+}
+
+define i1 @pcmpistri_mem_eq_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
+; X32-LABEL: pcmpistri_mem_eq_i16:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movdqu (%ecx), %xmm0
+; X32-NEXT:    movdqu (%eax), %xmm1
+; X32-NEXT:    pcmpistri $25, %xmm1, %xmm0
+; X32-NEXT:    setae %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistri_mem_eq_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu (%rsi), %xmm1
+; X64-NEXT:    pcmpistri $25, %xmm1, %xmm0
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
+entry: ; Lowering of "pcmpistric128(...) == 0" on <8 x i16> vectors loaded from memory.
+  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
+  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 ; align 1: the assertions above expect an unaligned movdqu load
+  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
+  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> ; the intrinsic is declared on <16 x i8>, so the i16 vectors are bitcast first
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
+  %result = icmp eq i32 %c, 0 ; the assertions above expect this compare to become a single setae
+  ret i1 %result
+}
+
+define i32 @pcmpistri_mem_idx_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
+; X32-LABEL: pcmpistri_mem_idx_i16:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movdqu (%ecx), %xmm0
+; X32-NEXT:    movdqu (%eax), %xmm1
+; X32-NEXT:    pcmpistri $25, %xmm1, %xmm0
+; X32-NEXT:    movl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistri_mem_idx_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqu (%rdi), %xmm0
+; X64-NEXT:    movdqu (%rsi), %xmm1
+; X64-NEXT:    pcmpistri $25, %xmm1, %xmm0
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpistri128 is the function result; <8 x i16> vectors are loaded from memory.
+  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
+  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 ; align 1: the assertions above expect an unaligned movdqu load
+  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
+  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> ; the intrinsic is declared on <16 x i8>, so the i16 vectors are bitcast first
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
+  ret i32 %idx ; the assertions above expect the index to be copied from %ecx to %eax
+}
+
+define i32 @pcmpistri_mem_diff_i16(i16* %lhs_ptr, i16* %rhs_ptr) nounwind {
+; X32-LABEL: pcmpistri_mem_diff_i16:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    andl $-16, %esp
+; X32-NEXT:    subl $48, %esp
+; X32-NEXT:    movl 12(%ebp), %eax
+; X32-NEXT:    movl 8(%ebp), %ecx
+; X32-NEXT:    movdqu (%ecx), %xmm1
+; X32-NEXT:    movdqu (%eax), %xmm0
+; X32-NEXT:    pcmpistri $25, %xmm0, %xmm1
+; X32-NEXT:    cmpl $8, %ecx
+; X32-NEXT:    jne .LBB23_2
+; X32-NEXT:  # %bb.1:
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    jmp .LBB23_3
+; X32-NEXT:  .LBB23_2: # %compare
+; X32-NEXT:    movdqa %xmm1, (%esp)
+; X32-NEXT:    addl %ecx, %ecx
+; X32-NEXT:    andl $14, %ecx
+; X32-NEXT:    movzwl (%esp,%ecx), %eax
+; X32-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    subw 16(%esp,%ecx), %ax
+; X32-NEXT:  .LBB23_3: # %exit
+; X32-NEXT:    movzwl %ax, %eax
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: pcmpistri_mem_diff_i16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movdqu (%rdi), %xmm1
+; X64-NEXT:    movdqu (%rsi), %xmm0
+; X64-NEXT:    pcmpistri $25, %xmm0, %xmm1
+; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT:    cmpl $8, %ecx
+; X64-NEXT:    jne .LBB23_2
+; X64-NEXT:  # %bb.1:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB23_2: # %compare
+; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    andl $7, %ecx
+; X64-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
+; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    subw -40(%rsp,%rcx,2), %ax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    retq
+entry: ; Lowering when the index returned by pcmpistri128 selects the two i16 elements to subtract; vectors are loaded from memory.
+  %lhs_vptr = bitcast i16* %lhs_ptr to <8 x i16>*
+  %lhs = load <8 x i16>, <8 x i16>* %lhs_vptr, align 1 ; align 1: the assertions above expect an unaligned movdqu load
+  %rhs_vptr = bitcast i16* %rhs_ptr to <8 x i16>*
+  %rhs = load <8 x i16>, <8 x i16>* %rhs_vptr, align 1
+  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8> ; the intrinsic is declared on <16 x i8>, so the i16 vectors are bitcast first
+  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
+  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
+  %eq = icmp eq i32 %idx, 8 ; index 8 bypasses the element lookup; the phi in %exit then yields 0
+  br i1 %eq, label %exit, label %compare
+
+compare:
+  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx ; variable index: the assertions above expect the loaded vector to be stored to the stack and indexed there
+  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
+  %sub = sub i16 %lhs_c, %rhs_c
+  br label %exit
+
+exit:
+  %result = phi i16 [ 0, %entry ], [ %sub, %compare ] ; 0 when %compare was skipped
+  %result_ext = zext i16 %result to i32
+  ret i32 %result_ext
+}




More information about the llvm-commits mailing list