[llvm] d5fa8b1 - [X86] Support SAE for VCVTPS2PH from intrinsic.
Freddy Ye via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 5 20:28:35 PDT 2022
Author: Freddy Ye
Date: 2022-09-06T11:28:12+08:00
New Revision: d5fa8b1c2c3f604d225fbe369b3c4c867473c030
URL: https://github.com/llvm/llvm-project/commit/d5fa8b1c2c3f604d225fbe369b3c4c867473c030
DIFF: https://github.com/llvm/llvm-project/commit/d5fa8b1c2c3f604d225fbe369b3c4c867473c030.diff
LOG: [X86] Support SAE for VCVTPS2PH from intrinsic.
For now, clang and gcc both fail to generate the SAE version from _mm512_cvt_roundps_ph:
https://godbolt.org/z/oh7eTGY5z. The Intrinsics Guide description is also wrong, and will be
updated soon.
Reviewed By: pengfei
Differential Revision: https://reviews.llvm.org/D132641
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/lib/Target/X86/X86ISelLowering.h
llvm/lib/Target/X86/X86InstrAVX512.td
llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
llvm/test/CodeGen/X86/avx512-intrinsics.ll
llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ba558f40c033e..389207f8d451b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27152,15 +27152,26 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
+ unsigned RC = 0;
+ unsigned Opc = IntrData->Opc0;
+ bool SAE = Src.getValueType().is512BitVector() &&
+ (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
+ if (SAE) {
+ Opc = X86ISD::CVTPS2PH_SAE;
+ Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
+ }
+
if (isAllOnesConstant(Mask))
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
+ return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
+ if (SAE)
+ Opc = X86ISD::MCVTPS2PH_SAE;
+ else
+ Opc = IntrData->Opc1;
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
- PassThru, Mask);
-
+ return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
}
case CVTNEPS2BF16_MASK: {
SDValue Src = Op.getOperand(1);
@@ -33841,7 +33852,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
NODE_NAME_CASE(CVTPS2PH)
NODE_NAME_CASE(STRICT_CVTPS2PH)
+ NODE_NAME_CASE(CVTPS2PH_SAE)
NODE_NAME_CASE(MCVTPS2PH)
+ NODE_NAME_CASE(MCVTPS2PH_SAE)
NODE_NAME_CASE(CVTPH2PS)
NODE_NAME_CASE(STRICT_CVTPH2PS)
NODE_NAME_CASE(CVTPH2PS_SAE)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index d7cdf431871f6..d056f53ecd21f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -703,12 +703,14 @@ namespace llvm {
// Conversions between float and half-float.
CVTPS2PH,
+ CVTPS2PH_SAE,
CVTPH2PS,
CVTPH2PS_SAE,
// Masked version of above.
// SRC, RND, PASSTHRU, MASK
MCVTPS2PH,
+ MCVTPS2PH_SAE,
// Galois Field Arithmetic Instructions
GF2P8AFFINEINVQB,
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index c105bde78ad1f..788bed3eb6896 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -9181,12 +9181,29 @@ let ExeDomain = GenericDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
SchedWrite Sched> {
- let hasSideEffects = 0, Uses = [MXCSR] in
- defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest,
- (outs _dest.RC:$dst),
- (ins _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2", []>,
- EVEX_B, AVX512AIi8Base, Sched<[Sched]>;
+ let hasSideEffects = 0, Uses = [MXCSR] in {
+ def rrb : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+ (ins _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, {sae}, $src1, $dst|$dst, $src1, {sae}, $src2}",
+ [(set _dest.RC:$dst,
+ (X86cvtps2phSAE (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>,
+ EVEX_B, Sched<[Sched]>;
+ let Constraints = "$src0 = $dst" in
+ def rrbk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+ (ins _dest.RC:$src0, _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, {sae}, $src1, $dst {${mask}}|$dst {${mask}}, $src1, {sae}, $src2}",
+ [(set _dest.RC:$dst,
+ (X86mcvtps2phSAE (_src.VT _src.RC:$src1), (i32 timm:$src2),
+ _dest.RC:$src0, _src.KRCWM:$mask))]>,
+ EVEX_B, Sched<[Sched]>, EVEX_K;
+ def rrbkz : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+ (ins _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, {sae}, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, {sae}, $src2}",
+ [(set _dest.RC:$dst,
+ (X86mcvtps2phSAE (_src.VT _src.RC:$src1), (i32 timm:$src2),
+ _dest.ImmAllZerosV, _src.KRCWM:$mask))]>,
+ EVEX_B, Sched<[Sched]>, EVEX_KZ;
+}
}
let Predicates = [HasAVX512] in {
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 57ba4683c6a40..f35294da45f05 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -762,13 +762,17 @@ def X86any_cvtps2ph : PatFrags<(ops node:$src1, node:$src2),
[(X86strict_cvtps2ph node:$src1, node:$src2),
(X86cvtps2ph node:$src1, node:$src2)]>;
-def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH",
- SDTypeProfile<1, 4, [SDTCVecEltisVT<0, i16>,
- SDTCVecEltisVT<1, f32>,
- SDTCisVT<2, i32>,
- SDTCisSameAs<0, 3>,
- SDTCVecEltisVT<4, i1>,
- SDTCisSameNumEltsAs<1, 4>]> >;
+def X86cvtps2phSAE : SDNode<"X86ISD::CVTPS2PH_SAE", SDTcvtps2ph>;
+
+def SDTmcvtps2ph : SDTypeProfile<1, 4, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisVT<2, i32>,
+ SDTCisSameAs<0, 3>,
+ SDTCVecEltisVT<4, i1>,
+ SDTCisSameNumEltsAs<1, 4>]>;
+def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH", SDTmcvtps2ph>;
+def X86mcvtps2phSAE : SDNode<"X86ISD::MCVTPS2PH_SAE", SDTmcvtps2ph>;
+
def X86vfpextSAE : SDNode<"X86ISD::VFPEXT_SAE",
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>,
SDTCisFP<1>, SDTCisVec<1>,
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 8455121b2b222..8e0943a10537b 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -1011,8 +1011,8 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
; X64-LABEL: test_x86_vcvtps2ph_256:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
-; X64-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
-; X64-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
+; X64-NEXT: vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z}
+; X64-NEXT: vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1}
; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X64-NEXT: vcvtps2ph $2, %zmm0, (%rsi)
; X64-NEXT: vmovdqa %ymm1, %ymm0
@@ -1022,15 +1022,15 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
-; X86-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
+; X86-NEXT: vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z}
+; X86-NEXT: vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1}
; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X86-NEXT: vcvtps2ph $2, %zmm0, (%eax)
; X86-NEXT: vmovdqa %ymm1, %ymm0
; X86-NEXT: retl
%res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
- %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> %src, i16 %mask)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 11, <16 x i16> zeroinitializer, i16 %mask)
+ %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 12, <16 x i16> %src, i16 %mask)
store <16 x i16> %res1, ptr %dst
%res = add <16 x i16> %res2, %res3
ret <16 x i16> %res
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
index 9e52b33b60c69..468e02d8884ad 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -4356,8 +4356,8 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
-; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x02]
-; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
+; X86-NEXT: vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a]
+; X86-NEXT: vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b]
; X86-NEXT: vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
@@ -4366,14 +4366,14 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
-; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x02]
-; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
+; X64-NEXT: vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a]
+; X64-NEXT: vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b]
; X64-NEXT: vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
- %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> %src, i8 %mask)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 10, <8 x i16> zeroinitializer, i8 %mask)
+ %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 11, <8 x i16> %src, i8 %mask)
%res0 = add <8 x i16> %res1, %res2
%res = add <8 x i16> %res3, %res0
ret <8 x i16> %res
@@ -4387,8 +4387,8 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
-; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x02]
-; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
+; X86-NEXT: vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b]
+; X86-NEXT: vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c]
; X86-NEXT: vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
@@ -4398,15 +4398,15 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
-; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x02]
-; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
+; X64-NEXT: vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b]
+; X64-NEXT: vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c]
; X64-NEXT: vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
- %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> %src, i8 %mask)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 11, <8 x i16> zeroinitializer, i8 %mask)
+ %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 12, <8 x i16> %src, i8 %mask)
%res0 = add <8 x i16> %res1, %res2
%res = add <8 x i16> %res3, %res0
ret <8 x i16> %res
More information about the llvm-commits
mailing list