[llvm] r329604 - [X86] Remove GCCBuiltin name from pmuldq/pmuludq intrinsics so clang can custom lower to native IR. Update fast-isel intrinsic tests for clang's new codegen.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 9 12:17:38 PDT 2018
Author: ctopper
Date: Mon Apr 9 12:17:38 2018
New Revision: 329604
URL: http://llvm.org/viewvc/llvm-project?rev=329604&view=rev
Log:
[X86] Remove GCCBuiltin name from pmuldq/pmuludq intrinsics so clang can custom lower to native IR. Update fast-isel intrinsic tests for clang's new codegen.
In some cases fast-isel fails to remove the and/shifts and uses blends or conditional moves.
But once masking gets involved, fast-isel aborts on the mask portion and we DAG combine more thoroughly.
Modified:
llvm/trunk/include/llvm/IR/IntrinsicsX86.td
llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
Modified: llvm/trunk/include/llvm/IR/IntrinsicsX86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsX86.td?rev=329604&r1=329603&r2=329604&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsX86.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td Mon Apr 9 12:17:38 2018
@@ -408,7 +408,7 @@ let TargetPrefix = "x86" in { // All in
def int_x86_sse2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw128">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
llvm_v8i16_ty], [IntrNoMem, Commutative]>;
- def int_x86_sse2_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq128">,
+ def int_x86_sse2_pmulu_dq : // FIXME: remove this intrinsic
Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty,
llvm_v4i32_ty], [IntrNoMem, Commutative]>;
def int_x86_sse2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd128">,
@@ -805,7 +805,7 @@ let TargetPrefix = "x86" in { // All in
// Vector multiply
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_sse41_pmuldq : GCCBuiltin<"__builtin_ia32_pmuldq128">,
+ def int_x86_sse41_pmuldq : // FIXME: remove this intrinsic
Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
[IntrNoMem, Commutative]>;
}
@@ -1667,10 +1667,10 @@ let TargetPrefix = "x86" in { // All in
def int_x86_avx2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw256">,
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
llvm_v16i16_ty], [IntrNoMem, Commutative]>;
- def int_x86_avx2_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq256">,
+ def int_x86_avx2_pmulu_dq : // FIXME: remove this intrinsic
Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty,
llvm_v8i32_ty], [IntrNoMem, Commutative]>;
- def int_x86_avx2_pmul_dq : GCCBuiltin<"__builtin_ia32_pmuldq256">,
+ def int_x86_avx2_pmul_dq : // FIXME: remove this intrinsic
Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty,
llvm_v8i32_ty], [IntrNoMem, Commutative]>;
def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">,
@@ -4783,9 +4783,9 @@ let TargetPrefix = "x86" in {
def int_x86_avx512_mask_psubus_w_512 : GCCBuiltin<"__builtin_ia32_psubusw512_mask">,
Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_pmulu_dq_512 : GCCBuiltin<"__builtin_ia32_pmuludq512">,
+ def int_x86_avx512_pmulu_dq_512 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
- def int_x86_avx512_pmul_dq_512 : GCCBuiltin<"__builtin_ia32_pmuldq512">,
+ def int_x86_avx512_pmul_dq_512 : // FIXME: remove this intrinsic
Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
def int_x86_avx512_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512">,
Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
Modified: llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll?rev=329604&r1=329603&r2=329604&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll Mon Apr 9 12:17:38 2018
@@ -1823,11 +1823,21 @@ declare <16 x i16> @llvm.x86.avx2.mpsadb
define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllq $32, %ymm0, %ymm0
+; CHECK-NEXT: vpsrad $31, %ymm0, %ymm2
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; CHECK-NEXT: vpsllq $32, %ymm1, %ymm1
+; CHECK-NEXT: vpsrad $31, %ymm1, %ymm2
+; CHECK-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
- %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
- %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
- %res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %arg0, <8 x i32> %arg1)
+ %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
+ %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
+ %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
+ %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
+ %res = mul nsw <4 x i64> %A1, %B1
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
@@ -1835,11 +1845,14 @@ declare <4 x i64> @llvm.x86.avx2.pmul.dq
define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epu32:
; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
- %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
- %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
- %res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %arg0, <8 x i32> %arg1)
+ %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ %res = mul nuw <4 x i64> %A, %B
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll?rev=329604&r1=329603&r2=329604&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll Mon Apr 9 12:17:38 2018
@@ -1816,5 +1816,149 @@ define <8 x i64> @test_mm512_zextsi256_s
ret <8 x i64> %res
}
+define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
+; X32-LABEL: test_mm512_mul_epi32:
+; X32: # %bb.0:
+; X32-NEXT: vpsllq $32, %zmm0, %zmm0
+; X32-NEXT: vpsraq $32, %zmm0, %zmm0
+; X32-NEXT: vpsllq $32, %zmm1, %zmm1
+; X32-NEXT: vpsraq $32, %zmm1, %zmm1
+; X32-NEXT: vpmuldq %zmm0, %zmm1, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mul_epi32:
+; X64: # %bb.0:
+; X64-NEXT: vpsllq $32, %zmm0, %zmm0
+; X64-NEXT: vpsraq $32, %zmm0, %zmm0
+; X64-NEXT: vpsllq $32, %zmm1, %zmm1
+; X64-NEXT: vpsraq $32, %zmm1, %zmm1
+; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm0
+; X64-NEXT: retq
+ %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
+ ret <8 x i64> %tmp4
+}
+
+define <8 x i64> @test_mm512_maskz_mul_epi32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
+; X32-LABEL: test_mm512_maskz_mul_epi32:
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_mul_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %conv = trunc i16 %__k to i8
+ %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
+ %tmp5 = bitcast i8 %conv to <8 x i1>
+ %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> zeroinitializer
+ ret <8 x i64> %tmp6
+}
+
+define <8 x i64> @test_mm512_mask_mul_epi32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
+; X32-LABEL: test_mm512_mask_mul_epi32:
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
+; X32-NEXT: vmovdqa64 %zmm2, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_mul_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
+; X64-NEXT: vmovdqa64 %zmm2, %zmm0
+; X64-NEXT: retq
+ %conv = trunc i16 %__k to i8
+ %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
+ %tmp5 = bitcast i8 %conv to <8 x i1>
+ %tmp6 = select <8 x i1> %tmp5, <8 x i64> %tmp4, <8 x i64> %__src
+ ret <8 x i64> %tmp6
+}
+
+define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
+; X32-LABEL: test_mm512_mul_epu32:
+; X32: # %bb.0:
+; X32-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; X32-NEXT: kmovw %eax, %k0
+; X32-NEXT: knotw %k0, %k1
+; X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; X32-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
+; X32-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mul_epu32:
+; X64: # %bb.0:
+; X64-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; X64-NEXT: kmovw %eax, %k0
+; X64-NEXT: knotw %k0, %k1
+; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; X64-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
+; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
+; X64-NEXT: retq
+ %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
+ ret <8 x i64> %tmp2
+}
+
+define <8 x i64> @test_mm512_maskz_mul_epu32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
+; X32-LABEL: test_mm512_maskz_mul_epu32:
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_mul_epu32:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ %conv = trunc i16 %__k to i8
+ %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
+ %tmp3 = bitcast i8 %conv to <8 x i1>
+ %tmp4 = select <8 x i1> %tmp3, <8 x i64> %tmp2, <8 x i64> zeroinitializer
+ ret <8 x i64> %tmp4
+}
+
+define <8 x i64> @test_mm512_mask_mul_epu32(i16 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
+; X32-LABEL: test_mm512_mask_mul_epu32:
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
+; X32-NEXT: vmovdqa64 %zmm2, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_mul_epu32:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
+; X64-NEXT: vmovdqa64 %zmm2, %zmm0
+; X64-NEXT: retq
+ %conv = trunc i16 %__k to i8
+ %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
+ %tmp3 = bitcast i8 %conv to <8 x i1>
+ %tmp4 = select <8 x i1> %tmp3, <8 x i64> %tmp2, <8 x i64> %__src
+ ret <8 x i64> %tmp4
+}
+
!0 = !{i32 1}
Modified: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll?rev=329604&r1=329603&r2=329604&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll Mon Apr 9 12:17:38 2018
@@ -3215,6 +3215,195 @@ define <8 x float> @test_mm256_maskz_shu
ret <8 x float> %res1
}
+define <4 x i64> @test_mm256_mask_mul_epi32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm256_mask_mul_epi32:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_mul_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
+ %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
+ %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
+ %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
+ %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
+ %tmp5 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> %__W
+ ret <4 x i64> %tmp6
+}
+
+define <4 x i64> @test_mm256_maskz_mul_epi32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm256_maskz_mul_epi32:
+; X32: # %bb.0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_mul_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+ %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32>
+ %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32>
+ %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32>
+ %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32>
+ %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1
+ %tmp5 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> zeroinitializer
+ ret <4 x i64> %tmp6
+}
+
+define <2 x i64> @test_mm_mask_mul_epi32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm_mask_mul_epi32:
+; X32: # %bb.0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_mul_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1}
+; X64-NEXT: retq
+ %tmp = shl <2 x i64> %__X, <i64 32, i64 32>
+ %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
+ %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
+ %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
+ %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
+ %tmp5 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> %__W
+ ret <2 x i64> %tmp6
+}
+
+define <2 x i64> @test_mm_maskz_mul_epi32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm_maskz_mul_epi32:
+; X32: # %bb.0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_mul_epi32:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+ %tmp = shl <2 x i64> %__X, <i64 32, i64 32>
+ %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32>
+ %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32>
+ %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32>
+ %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1
+ %tmp5 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> zeroinitializer
+ ret <2 x i64> %tmp6
+}
+
+define <4 x i64> @test_mm256_mask_mul_epu32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm256_mask_mul_epu32:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_mul_epu32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ %tmp2 = mul nuw <4 x i64> %tmp1, %tmp
+ %tmp3 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> %__W
+ ret <4 x i64> %tmp4
+}
+
+define <4 x i64> @test_mm256_maskz_mul_epu32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm256_maskz_mul_epu32:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_mul_epu32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+ %tmp2 = mul nuw <4 x i64> %tmp1, %tmp
+ %tmp3 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> zeroinitializer
+ ret <4 x i64> %tmp4
+}
+
+define <2 x i64> @test_mm_mask_mul_epu32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm_mask_mul_epu32:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_mul_epu32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
+ %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
+ %tmp2 = mul nuw <2 x i64> %tmp1, %tmp
+ %tmp3 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> %__W
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i64> @test_mm_maskz_mul_epu32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind {
+; X32-LABEL: test_mm_maskz_mul_epu32:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_mul_epu32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295>
+ %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295>
+ %tmp2 = mul nuw <2 x i64> %tmp1, %tmp
+ %tmp3 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> zeroinitializer
+ ret <2 x i64> %tmp4
+}
+
declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>)
declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
Modified: llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll?rev=329604&r1=329603&r2=329604&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll Mon Apr 9 12:17:38 2018
@@ -1853,22 +1853,27 @@ define i32 @test_mm_movemask_pd(<2 x dou
}
declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
-define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) {
+define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_mul_epu32:
; X32: # %bb.0:
+; X32-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; X32-NEXT: pand %xmm2, %xmm0
+; X32-NEXT: pand %xmm2, %xmm1
; X32-NEXT: pmuludq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mul_epu32:
; X64: # %bb.0:
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; X64-NEXT: pand %xmm2, %xmm0
+; X64-NEXT: pand %xmm2, %xmm1
; X64-NEXT: pmuludq %xmm1, %xmm0
; X64-NEXT: retq
- %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
- %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
- %res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %arg0, <4 x i32> %arg1)
+ %A = and <2 x i64> %a0, <i64 4294967295, i64 4294967295>
+ %B = and <2 x i64> %a1, <i64 4294967295, i64 4294967295>
+ %res = mul nuw <2 x i64> %A, %B
ret <2 x i64> %res
}
-declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x double> @test_mm_mul_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_mul_pd:
Modified: llvm/trunk/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll?rev=329604&r1=329603&r2=329604&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll Mon Apr 9 12:17:38 2018
@@ -787,16 +787,34 @@ declare <8 x i16> @llvm.x86.sse41.mpsadb
define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_mul_epi32:
; X32: # %bb.0:
+; X32-NEXT: psllq $32, %xmm0
+; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X32-NEXT: psrad $31, %xmm0
+; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; X32-NEXT: psllq $32, %xmm1
+; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X32-NEXT: psrad $31, %xmm1
+; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; X32-NEXT: pmuldq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mul_epi32:
; X64: # %bb.0:
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-NEXT: psrad $31, %xmm0
+; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; X64-NEXT: psllq $32, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; X64-NEXT: psrad $31, %xmm1
+; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; X64-NEXT: pmuldq %xmm1, %xmm0
; X64-NEXT: retq
- %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
- %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
- %res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %arg0, <4 x i32> %arg1)
+ %A = shl <2 x i64> %a0, <i64 32, i64 32>
+ %A1 = ashr exact <2 x i64> %A, <i64 32, i64 32>
+ %B = shl <2 x i64> %a1, <i64 32, i64 32>
+ %B1 = ashr exact <2 x i64> %B, <i64 32, i64 32>
+ %res = mul nsw <2 x i64> %A1, %B1
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
More information about the llvm-commits
mailing list