[llvm] r288703 - [X86] Fix non-intrinsic roundss/roundsd to not read the destination register
Michael Kuperstein via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 5 12:57:38 PST 2016
Author: mkuper
Date: Mon Dec 5 14:57:37 2016
New Revision: 288703
URL: http://llvm.org/viewvc/llvm-project?rev=288703&view=rev
Log:
[X86] Fix non-intrinsic roundss/roundsd to not read the destination register
This changes the scalar non-intrinsic non-avx roundss/sd instruction
definitions not to read their destination register - allowing partial dependency
breaking.
This fixes PR31143.
Differential Revision: https://reviews.llvm.org/D27323
Added:
llvm/trunk/test/CodeGen/X86/pr31143.ll
Modified:
llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
llvm/trunk/lib/Target/X86/X86InstrSSE.td
Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=288703&r1=288702&r2=288703&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Mon Dec 5 14:57:37 2016
@@ -592,6 +592,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE },
{ X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
{ X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
+ { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
+ { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
{ X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
{ X86::RSQRTSSr, X86::RSQRTSSm, 0 },
{ X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE },
@@ -1212,8 +1214,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
{ X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
{ X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
- { X86::ROUNDSDr, X86::ROUNDSDm, 0 },
- { X86::ROUNDSSr, X86::ROUNDSSm, 0 },
{ X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE },
{ X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE },
{ X86::SBB32rr, X86::SBB32rm, 0 },
Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=288703&r1=288702&r2=288703&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Mon Dec 5 14:57:37 2016
@@ -6268,10 +6268,10 @@ let Predicates = [UseAVX] in {
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//
-multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
- X86MemOperand x86memop, RegisterClass RC,
- PatFrag mem_frag32, PatFrag mem_frag64,
- Intrinsic V4F32Int, Intrinsic V2F64Int> {
+multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ PatFrag mem_frag32, PatFrag mem_frag64,
+ Intrinsic V4F32Int, Intrinsic V2F64Int> {
let ExeDomain = SSEPackedSingle in {
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
@@ -6312,35 +6312,73 @@ let ExeDomain = SSEPackedDouble in {
} // ExeDomain = SSEPackedDouble
}
-multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
- string OpcodeStr,
- Intrinsic F32Int,
- Intrinsic F64Int, bit Is2Addr = 1> {
-let ExeDomain = GenericDomain in {
- // Operation, reg.
- let hasSideEffects = 0 in
+multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr> {
+let ExeDomain = GenericDomain, hasSideEffects = 0 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
- (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
- !if(Is2Addr,
- !strconcat(OpcodeStr,
- "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !strconcat(OpcodeStr,
- "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[WriteFAdd]>;
- // Operation, mem.
- let mayLoad = 1, hasSideEffects = 0 in
+ let mayLoad = 1 in
def SSm : SS4AIi8<opcss, MRMSrcMem,
(outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
- !if(Is2Addr,
- !strconcat(OpcodeStr,
- "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !strconcat(OpcodeStr,
- "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[WriteFAddLd, ReadAfterLd]>;
- // Intrinsic operation, reg.
- let isCodeGenOnly = 1 in
+ def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteFAdd]>;
+
+ let mayLoad = 1 in
+ def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, hasSideEffects = 0
+}
+
+multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr> {
+let ExeDomain = GenericDomain, hasSideEffects = 0 in {
+ def SSr : SS4AIi8<opcss, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAdd]>;
+
+ let mayLoad = 1 in
+ def SSm : SS4AIi8<opcss, MRMSrcMem,
+ (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+
+ def SDr : SS4AIi8<opcsd, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAdd]>;
+
+ let mayLoad = 1 in
+ def SDm : SS4AIi8<opcsd, MRMSrcMem,
+ (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = GenericDomain, hasSideEffects = 0
+}
+
+multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr,
+ Intrinsic F32Int,
+ Intrinsic F64Int, bit Is2Addr = 1> {
+let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -6351,8 +6389,6 @@ let ExeDomain = GenericDomain in {
[(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
Sched<[WriteFAdd]>;
- // Intrinsic operation, mem.
- let isCodeGenOnly = 1 in
def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -6364,30 +6400,6 @@ let ExeDomain = GenericDomain in {
(F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
Sched<[WriteFAddLd, ReadAfterLd]>;
- // Operation, reg.
- let hasSideEffects = 0 in
- def SDr : SS4AIi8<opcsd, MRMSrcReg,
- (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
- !if(Is2Addr,
- !strconcat(OpcodeStr,
- "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !strconcat(OpcodeStr,
- "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- []>, Sched<[WriteFAdd]>;
-
- // Operation, mem.
- let mayLoad = 1, hasSideEffects = 0 in
- def SDm : SS4AIi8<opcsd, MRMSrcMem,
- (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
- !if(Is2Addr,
- !strconcat(OpcodeStr,
- "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !strconcat(OpcodeStr,
- "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- []>, Sched<[WriteFAddLd, ReadAfterLd]>;
-
- // Intrinsic operation, reg.
- let isCodeGenOnly = 1 in
def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -6398,8 +6410,6 @@ let ExeDomain = GenericDomain in {
[(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
Sched<[WriteFAdd]>;
- // Intrinsic operation, mem.
- let isCodeGenOnly = 1 in
def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -6410,23 +6420,24 @@ let ExeDomain = GenericDomain in {
[(set VR128:$dst,
(F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
Sched<[WriteFAddLd, ReadAfterLd]>;
-} // ExeDomain = GenericDomain
+} // ExeDomain = GenericDomain, isCodeGenOnly = 1
}
// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX] in {
// Intrinsic form
- defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
- loadv4f32, loadv2f64,
- int_x86_sse41_round_ps,
- int_x86_sse41_round_pd>, VEX;
- defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
- loadv8f32, loadv4f64,
- int_x86_avx_round_ps_256,
- int_x86_avx_round_pd_256>, VEX, VEX_L;
- defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
- int_x86_sse41_round_ss,
- int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+ defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
+ loadv4f32, loadv2f64,
+ int_x86_sse41_round_ps,
+ int_x86_sse41_round_pd>, VEX;
+ defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
+ loadv8f32, loadv4f64,
+ int_x86_avx_round_ps_256,
+ int_x86_avx_round_pd_256>, VEX, VEX_L;
+ defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround",
+ int_x86_sse41_round_ss,
+ int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+ defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
}
let Predicates = [UseAVX] in {
@@ -6498,34 +6509,37 @@ let Predicates = [HasAVX] in {
(VROUNDYPDr VR256:$src, (i32 0xB))>;
}
-defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
- memopv4f32, memopv2f64,
- int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
+defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128,
+ memopv4f32, memopv2f64, int_x86_sse41_round_ps,
+ int_x86_sse41_round_pd>;
+
+defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">;
+
let Constraints = "$src1 = $dst" in
-defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round",
+defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round",
int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
let Predicates = [UseSSE41] in {
def : Pat<(ffloor FR32:$src),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
+ (ROUNDSSr FR32:$src, (i32 0x9))>;
def : Pat<(f64 (ffloor FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
+ (ROUNDSDr FR64:$src, (i32 0x9))>;
def : Pat<(f32 (fnearbyint FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+ (ROUNDSSr FR32:$src, (i32 0xC))>;
def : Pat<(f64 (fnearbyint FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+ (ROUNDSDr FR64:$src, (i32 0xC))>;
def : Pat<(f32 (fceil FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
+ (ROUNDSSr FR32:$src, (i32 0xA))>;
def : Pat<(f64 (fceil FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
+ (ROUNDSDr FR64:$src, (i32 0xA))>;
def : Pat<(f32 (frint FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+ (ROUNDSSr FR32:$src, (i32 0x4))>;
def : Pat<(f64 (frint FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+ (ROUNDSDr FR64:$src, (i32 0x4))>;
def : Pat<(f32 (ftrunc FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
+ (ROUNDSSr FR32:$src, (i32 0xB))>;
def : Pat<(f64 (ftrunc FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
+ (ROUNDSDr FR64:$src, (i32 0xB))>;
def : Pat<(v4f32 (ffloor VR128:$src)),
(ROUNDPSr VR128:$src, (i32 0x9))>;
Added: llvm/trunk/test/CodeGen/X86/pr31143.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr31143.ll?rev=288703&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr31143.ll (added)
+++ llvm/trunk/test/CodeGen/X86/pr31143.ll Mon Dec 5 14:57:37 2016
@@ -0,0 +1,60 @@
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -mattr=+sse4.2 < %s | FileCheck %s
+
+; CHECK-LABEL: testss:
+; CHECK: movss {{.*}}, %[[XMM0:xmm[0-9]+]]
+; CHECK: xorps %[[XMM1:xmm[0-9]+]], %[[XMM1]]
+; CHECK: roundss $9, %[[XMM0]], %[[XMM1]]
+
+define void @testss(float* nocapture %a, <4 x float>* nocapture %b, i32 %k) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ %v = load float, float* %arrayidx, align 4
+ %floor = call float @floorf(float %v)
+ %sub = fsub float %floor, %v
+ %v1 = insertelement <4 x float> undef, float %sub, i32 0
+ %br = shufflevector <4 x float> %v1, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ store volatile <4 x float> %br, <4 x float>* %b, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %k
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: testsd:
+; CHECK: movsd {{.*}}, %[[XMM0:xmm[0-9]+]]
+; CHECK: xorps %[[XMM1:xmm[0-9]+]], %[[XMM1]]
+; CHECK: roundsd $9, %[[XMM0]], %[[XMM1]]
+
+define void @testsd(double* nocapture %a, <2 x double>* nocapture %b, i32 %k) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+ %v = load double, double* %arrayidx, align 4
+ %floor = call double @floor(double %v)
+ %sub = fsub double %floor, %v
+ %v1 = insertelement <2 x double> undef, double %sub, i32 0
+ %br = shufflevector <2 x double> %v1, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ store volatile <2 x double> %br, <2 x double>* %b, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %k
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+declare float @floorf(float) nounwind readnone
+
+declare double @floor(double) nounwind readnone
+
More information about the llvm-commits
mailing list