PATCHES: R600/SI: Remove SelectionDAG operand folding
Matt Arsenault
Matthew.Arsenault at amd.com
Fri Dec 19 08:14:57 PST 2014
On 12/18/2014 08:02 PM, Tom Stellard wrote:
> Hi,
>
> This series of patches removes the legacy operand folding that was done on
> the SelectionDAG. The SIFoldOperands MachineInstr pass now provides the
> same functionality.
>
> -Tom
>
> 0001-R600-SI-Use-immediates-in-the-first-operand-in-fabs-.patch
>
>
> From d6dc4e6bbf378ad86e925982d38297a2622b4ffa Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Thu, 11 Dec 2014 18:34:11 -0500
> Subject: [PATCH 1/7] R600/SI: Use immediates in the first operand in fabs/fneg
> patterns
>
> This is for the stand-alone patterns that are lowered using v_or_b32
> and v_xor_b32. Putting the immediate in the first operand ensures
> that it will be folded into the instruction.
I looked at doing this before, and wasn't sure it was the best idea. In
general I think the folding of literals should be smarter considering
the uses the literal has. For example, in the test update, an unrolled
vector xor with immediate will now have a copy of the 32-bit immediate for
every vector component. It would probably be smarter to move the
immediate into a register in this case, and re-use for every operand.
It's 8 * N vs. 8 + 4 * (N - 1) bytes for the whole vector to save only 4
cycles, so it ends up being smaller. Maybe this should be an -Os vs. -O3
kind of option. This also applies for SGPR uses.
> ---
> lib/Target/R600/SIInstructions.td | 18 +++++++++---------
> test/CodeGen/R600/fneg-fabs.f64.ll | 26 ++++++++++----------------
> test/CodeGen/R600/fneg-fabs.ll | 26 +++++++++-----------------
> 3 files changed, 28 insertions(+), 42 deletions(-)
>
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 463287e..96f75f9 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -2491,7 +2491,7 @@ def : Pat <
> // FIXME: Should use S_OR_B32
> def : Pat <
> (fneg (fabs f32:$src)),
> - (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */
> + (V_OR_B32_e32 (V_MOV_B32_e32 0x80000000), $src) /* Set sign bit */
> >;
>
> // FIXME: Should use S_OR_B32
> @@ -2500,19 +2500,19 @@ def : Pat <
> (REG_SEQUENCE VReg_64,
> (i32 (EXTRACT_SUBREG f64:$src, sub0)),
> sub0,
> - (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
> - (V_MOV_B32_e32 0x80000000)), // Set sign bit.
> + (V_OR_B32_e32 (V_MOV_B32_e32 0x80000000), // Set sign bit.
> + (EXTRACT_SUBREG f64:$src, sub1)),
> sub1)
> >;
>
> def : Pat <
> (fabs f32:$src),
> - (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff))
> + (V_AND_B32_e32 (V_MOV_B32_e32 0x7fffffff), $src)
> >;
>
> def : Pat <
> (fneg f32:$src),
> - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000))
> + (V_XOR_B32_e32 (V_MOV_B32_e32 0x80000000), $src)
> >;
>
> def : Pat <
> @@ -2520,8 +2520,8 @@ def : Pat <
> (REG_SEQUENCE VReg_64,
> (i32 (EXTRACT_SUBREG f64:$src, sub0)),
> sub0,
> - (V_AND_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
> - (V_MOV_B32_e32 0x7fffffff)), // Set sign bit.
> + (V_AND_B32_e32 (V_MOV_B32_e32 0x7fffffff), // Set sign bit.
> + (EXTRACT_SUBREG f64:$src, sub1)),
> sub1)
> >;
>
> @@ -2530,8 +2530,8 @@ def : Pat <
> (REG_SEQUENCE VReg_64,
> (i32 (EXTRACT_SUBREG f64:$src, sub0)),
> sub0,
> - (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
> - (V_MOV_B32_e32 0x80000000)),
> + (V_XOR_B32_e32 (V_MOV_B32_e32 0x80000000),
> + (EXTRACT_SUBREG f64:$src, sub1)),
> sub1)
> >;
>
> diff --git a/test/CodeGen/R600/fneg-fabs.f64.ll b/test/CodeGen/R600/fneg-fabs.f64.ll
> index 555f4cc..6584108 100644
> --- a/test/CodeGen/R600/fneg-fabs.f64.ll
> +++ b/test/CodeGen/R600/fneg-fabs.f64.ll
> @@ -4,8 +4,7 @@
> ; into 2 modifiers, although theoretically that should work.
>
> ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f64:
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x7fffffff
> -; SI: v_and_b32_e32 v[[FABS:[0-9]+]], {{s[0-9]+}}, [[IMMREG]]
> +; SI: v_and_b32_e32 v[[FABS:[0-9]+]], 0x7fffffff, {{v[0-9]+}}
> ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+}}:[[FABS]]{{\]}}
> define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
> %fabs = call double @llvm.fabs.f64(double %x)
> @@ -45,8 +44,7 @@ define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
> }
>
> ; FUNC-LABEL: {{^}}fneg_fabs_fn_free_f64:
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
> %bc = bitcast i64 %in to double
> %fabs = call double @fabs(double %bc)
> @@ -58,8 +56,8 @@ define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
> ; FUNC-LABEL: {{^}}fneg_fabs_f64:
> ; SI: s_load_dwordx2
> ; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
> +; SI: v_mov_b32_e32 [[HI_X_V:v[0-9]+]], s[[HI_X]]
> +; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], 0x80000000, [[HI_X_V]]
> ; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
> ; SI: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
> define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
> @@ -70,10 +68,8 @@ define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
> }
>
> ; FUNC-LABEL: {{^}}fneg_fabs_v2f64:
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI-NOT: 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
> %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
> %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
> @@ -82,12 +78,10 @@ define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in)
> }
>
> ; FUNC-LABEL: {{^}}fneg_fabs_v4f64:
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI-NOT: 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
> %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
> %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
> diff --git a/test/CodeGen/R600/fneg-fabs.ll b/test/CodeGen/R600/fneg-fabs.ll
> index 3cc832f..12cc2a6 100644
> --- a/test/CodeGen/R600/fneg-fabs.ll
> +++ b/test/CodeGen/R600/fneg-fabs.ll
> @@ -33,8 +33,7 @@ define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
> ; R600: |PV.{{[XYZW]}}|
> ; R600: -PV
>
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
> %bc = bitcast i32 %in to float
> %fabs = call float @llvm.fabs.f32(float %bc)
> @@ -48,8 +47,7 @@ define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
> ; R600: |PV.{{[XYZW]}}|
> ; R600: -PV
>
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
> %bc = bitcast i32 %in to float
> %fabs = call float @fabs(float %bc)
> @@ -59,8 +57,7 @@ define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
> }
>
> ; FUNC-LABEL: {{^}}fneg_fabs_f32:
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
> %fabs = call float @llvm.fabs.f32(float %in)
> %fsub = fsub float -0.000000e+00, %fabs
> @@ -84,11 +81,8 @@ define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in)
> ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
> ; R600: -PV
>
> -; FIXME: SGPR should be used directly for first src operand.
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI-NOT: 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
> %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
> %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
> @@ -98,12 +92,10 @@ define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
>
> ; FIXME: SGPR should be used directly for first src operand.
> ; FUNC-LABEL: {{^}}fneg_fabs_v4f32:
> -; SI: v_mov_b32_e32 [[IMMREG:v[0-9]+]], 0x80000000
> -; SI-NOT: 0x80000000
> -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
> -; SI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[IMMREG]]
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> +; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
> define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
> %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
> %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
> -- 1.8.5.5
>
> 0002-R600-SI-Make-sure-non-inline-constants-aren-t-folded.patch
>
>
> From 9dacc28f2ff57dafa62c091cb3bc5b995a7c2255 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Thu, 18 Dec 2014 16:42:54 -0500
> Subject: [PATCH 2/7] R600/SI: Make sure non-inline constants aren't folded
> into mubuf soffset operand
>
> mubuf instructions now define the soffset field using the SCSrc_32
> register class which indicates that only SGPRs and inline constants
> are allowed.
> ---
> lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp | 5 ++--
> lib/Target/R600/SIInstrInfo.td | 30 ++++++++++++------------
> lib/Target/R600/SIRegisterInfo.cpp | 1 +
> lib/Target/R600/SIRegisterInfo.td | 6 +++++
> test/CodeGen/R600/mubuf.ll | 25 ++++++++++++++++++++
> 5 files changed, 50 insertions(+), 17 deletions(-)
LGTM. I didn't know soffset could be a constant. I think more tests
should be included that use inline immediate values for the constant,
and at the limits of the offset range (those might already be there)
>
>
> 0003-R600-SI-isLegalOperand-shouldn-t-check-constant-bus-.patch
>
>
> From 47155e7618b04c2640f3dc1b2f77cfa6c70f6b2a Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Thu, 11 Dec 2014 18:47:07 -0500
> Subject: [PATCH 3/7] R600/SI: isLegalOperand() shouldn't check constant bus
> for SALU instructions
>
> The constant bus restrictions only apply to VALU instructions. This
> enables SIFoldOperands to fold immediates into SALU instructions.
> ---
> lib/Target/R600/SIInstrInfo.cpp | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index 08bfc5e..a58a46e 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -1405,7 +1405,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
> if (!MO)
> MO = &MI->getOperand(OpIdx);
>
> - if (usesConstantBus(MRI, *MO)) {
> + if (isVALU(InstDesc.Opcode) && usesConstantBus(MRI, *MO)) {
> unsigned SGPRUsed =
> MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
> for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
> -- 1.8.5.5
LGTM
>
> 0004-R600-SI-Refactor-SIFoldOperands-to-simplify-immediat.patch
>
>
> From b25bd5edfda7d9a16682e45ff40faa4411f81a73 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Wed, 17 Dec 2014 11:41:55 -0500
> Subject: [PATCH 4/7] R600/SI: Refactor SIFoldOperands to simplify immediate
> folding
>
> This will make a future patch much less intrusive.
> ---
> lib/Target/R600/SIFoldOperands.cpp | 79 ++++++++++++++++++++++++++------------
> 1 file changed, 54 insertions(+), 25 deletions(-)
>
> diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp
> index 761e866..b172ee8 100644
> --- a/lib/Target/R600/SIFoldOperands.cpp
> +++ b/lib/Target/R600/SIFoldOperands.cpp
> @@ -49,6 +49,23 @@ public:
> }
> };
>
> +struct FoldCandidate {
> + MachineInstr *UseMI;
> + unsigned UseOpNo;
> + MachineOperand *OpToFold;
> + uint64_t ImmToFold;
> +
> + FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
> + UseMI(MI), UseOpNo(OpNo), OpToFold(FoldOp), ImmToFold(0) { }
> +
> + FoldCandidate(MachineInstr *MI, unsigned OpNo, uint64_t Imm) :
> + UseMI(MI), UseOpNo(OpNo), OpToFold(nullptr), ImmToFold(Imm) { }
> +
> + bool IsImm() const {
Should be isImm()
> + return !OpToFold;
> + }
> +};
> +
> } // End anonymous namespace.
>
> INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE,
> @@ -78,30 +95,24 @@ static bool isSafeToFold(unsigned Opcode) {
> }
> }
>
> -static bool updateOperand(MachineInstr *MI, unsigned OpNo,
> - const MachineOperand &New,
> +static bool updateOperand(FoldCandidate &Fold,
> const TargetRegisterInfo &TRI) {
> - MachineOperand &Old = MI->getOperand(OpNo);
> + MachineInstr *MI = Fold.UseMI;
> + MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
> assert(Old.isReg());
>
> - if (New.isImm()) {
> - Old.ChangeToImmediate(New.getImm());
> + if (Fold.IsImm()) {
> + Old.ChangeToImmediate(Fold.ImmToFold);
> return true;
> }
>
> - if (New.isFPImm()) {
> - Old.ChangeToFPImmediate(New.getFPImm());
> + MachineOperand *New = Fold.OpToFold;
> + if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
> + TargetRegisterInfo::isVirtualRegister(New->getReg())) {
> + Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
> return true;
> }
>
> - if (New.isReg()) {
> - if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
> - TargetRegisterInfo::isVirtualRegister(New.getReg())) {
> - Old.substVirtReg(New.getReg(), New.getSubReg(), TRI);
> - return true;
> - }
> - }
> -
> // FIXME: Handle physical registers.
>
> return false;
> @@ -133,7 +144,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
> OpToFold.getSubReg()))
> continue;
>
> - std::vector<std::pair<MachineInstr *, unsigned>> FoldList;
> + std::vector<FoldCandidate> FoldList;
> for (MachineRegisterInfo::use_iterator
> Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end();
> Use != E; ++Use) {
> @@ -146,10 +157,11 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
> continue;
> }
>
> + bool FoldingImm = OpToFold.isImm() || OpToFold.isFPImm();
> +
Have you tried replacing all FP immediates with integers instead?
> // In order to fold immediates into copies, we need to change the
> // copy to a MOV.
> - if ((OpToFold.isImm() || OpToFold.isFPImm()) &&
> - UseMI->getOpcode() == AMDGPU::COPY) {
> + if (FoldingImm && UseMI->getOpcode() == AMDGPU::COPY) {
> const TargetRegisterClass *TRC =
> MRI.getRegClass(UseMI->getOperand(0).getReg());
>
> @@ -173,9 +185,24 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
> UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
> continue;
>
> - // Normal substitution
> + if (FoldingImm) {
> + uint64_t Imm;
> + if (OpToFold.isFPImm()) {
> + Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
> + } else {
> + Imm = OpToFold.getImm();
> + }
> +
> + const MachineOperand ImmOp = MachineOperand::CreateImm(Imm);
> + if (TII->isOperandLegal(UseMI, Use.getOperandNo(), &ImmOp)) {
> + FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(), Imm));
> + continue;
> + }
> + }
> +
> + // Normal substitution with registers
> if (TII->isOperandLegal(UseMI, Use.getOperandNo(), &OpToFold)) {
> - FoldList.push_back(std::make_pair(UseMI, Use.getOperandNo()));
> + FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(), &OpToFold));
> continue;
> }
>
> @@ -187,13 +214,15 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
> // already does this.
> }
>
> - for (std::pair<MachineInstr *, unsigned> Fold : FoldList) {
> - if (updateOperand(Fold.first, Fold.second, OpToFold, TRI)) {
> + for (FoldCandidate &Fold : FoldList) {
> + if (updateOperand(Fold, TRI)) {
> // Clear kill flags.
> - if (OpToFold.isReg())
> - OpToFold.setIsKill(false);
> + if (!Fold.IsImm()) {
> + assert(Fold.OpToFold && Fold.OpToFold->isReg());
> + Fold.OpToFold->setIsKill(false);
> + }
> DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
> - Fold.second << " of " << *Fold.first << '\n');
> + Fold.UseOpNo << " of " << *Fold.UseMI << '\n');
> }
> }
> }
> -- 1.8.5.5
>
> 0005-R600-SI-Teach-SIFoldOperands-to-split-64-bit-constan.patch
>
>
> From a8f3e690476d07a0b87adc624c690a6a68b22736 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Thu, 11 Dec 2014 18:49:19 -0500
> Subject: [PATCH 5/7] R600/SI: Teach SIFoldOperands to split 64-bit constants
> when folding
>
> This allows folding of sequences like:
>
> s[0:1] = s_mov_b64 4
> v_add_i32 v0, s0, v0
> v_addc_u32 v1, s1, v1
>
> into
>
> v_add_i32 v0, 4, v0
> v_add_i32 v1, 0, v1
> ---
> lib/Target/R600/SIFoldOperands.cpp | 69 +++++++++++++++++++++++-------------
> test/CodeGen/R600/operand-folding.ll | 17 +++++++++
> test/CodeGen/R600/sint_to_fp.f64.ll | 8 ++---
> test/CodeGen/R600/uint_to_fp.f64.ll | 8 ++---
> 4 files changed, 69 insertions(+), 33 deletions(-)
>
> diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp
> index b172ee8..c661b0a 100644
> --- a/lib/Target/R600/SIFoldOperands.cpp
> +++ b/lib/Target/R600/SIFoldOperands.cpp
> @@ -153,27 +153,51 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
> const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo());
>
> // FIXME: Fold operands with subregs.
> - if (UseOp.isReg() && UseOp.getSubReg()) {
> + if (UseOp.isReg() && UseOp.getSubReg() && OpToFold.isReg()) {
> continue;
> }
>
> bool FoldingImm = OpToFold.isImm() || OpToFold.isFPImm();
> + APInt Imm;
>
> - // In order to fold immediates into copies, we need to change the
> - // copy to a MOV.
> - if (FoldingImm && UseMI->getOpcode() == AMDGPU::COPY) {
> - const TargetRegisterClass *TRC =
> - MRI.getRegClass(UseMI->getOperand(0).getReg());
> -
> - if (TRC->getSize() == 4) {
> - if (TRI.isSGPRClass(TRC))
> - UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
> - else
> - UseMI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
> - } else if (TRC->getSize() == 8 && TRI.isSGPRClass(TRC)) {
> - UseMI->setDesc(TII->get(AMDGPU::S_MOV_B64));
> + if (FoldingImm) {
> + const TargetRegisterClass *UseRC = MRI.getRegClass(UseOp.getReg());
> +
> + if (OpToFold.isFPImm()) {
> + Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt();
> } else {
> - continue;
> + Imm = APInt(64, OpToFold.getImm());
> + }
> +
> + // Split 64-bit constants into 32-bits for folding.
> + if (UseOp.getSubReg()) {
> + if (UseRC->getSize() != 8)
> + continue;
> +
> + if (UseOp.getSubReg() == AMDGPU::sub0) {
> + Imm = Imm.getLoBits(32);
> + } else {
> + assert(UseOp.getSubReg() == AMDGPU::sub1);
> + Imm = Imm.getHiBits(32);
> + }
> + }
> +
> + // In order to fold immediates into copies, we need to change the
> + // copy to a MOV.
> + if (UseMI->getOpcode() == AMDGPU::COPY) {
> + const TargetRegisterClass *TRC =
> + MRI.getRegClass(UseMI->getOperand(0).getReg());
> +
> + if (TRC->getSize() == 4) {
> + if (TRI.isSGPRClass(TRC))
> + UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
> + else
> + UseMI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
> + } else if (TRC->getSize() == 8 && TRI.isSGPRClass(TRC)) {
> + UseMI->setDesc(TII->get(AMDGPU::S_MOV_B64));
> + } else {
> + continue;
> + }
It would probably be useful to factor this into a getMovOpcode() somewhere
> }
> }
>
> @@ -185,19 +209,14 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
> UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
> continue;
>
> - if (FoldingImm) {
> - uint64_t Imm;
> - if (OpToFold.isFPImm()) {
> - Imm = OpToFold.getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
> - } else {
> - Imm = OpToFold.getImm();
> - }
>
> - const MachineOperand ImmOp = MachineOperand::CreateImm(Imm);
> + if (FoldingImm) {
> + const MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
> if (TII->isOperandLegal(UseMI, Use.getOperandNo(), &ImmOp)) {
> - FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(), Imm));
> - continue;
> + FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(),
> + Imm.getSExtValue()));
> }
> + continue;
> }
>
> // Normal substitution with registers
> diff --git a/test/CodeGen/R600/operand-folding.ll b/test/CodeGen/R600/operand-folding.ll
> index 05177b4..f62aa09 100644
> --- a/test/CodeGen/R600/operand-folding.ll
> +++ b/test/CodeGen/R600/operand-folding.ll
> @@ -36,5 +36,22 @@ endif:
> ret void
> }
>
> +; CHECK-LABEL: {{^}}fold_64bit_constant_add:
> +; CHECK-NOT: s_mov_b64
> +; FIXME: It would be better if we clud use v_add here and drop the extra
Typo 'clud'
> +; v_mov_b32 instructions.
> +; CHECK-DAG: s_add_u32 [[LO:s[0-9]+]], s{{[0-9]+}}, 1
> +; CHECK-DAG: s_addc_u32 [[HI:s[0-9]+]], s{{[0-9]+}}, 0
> +; CHECK-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[LO]]
> +; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
> +; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},
> +
> +define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
> +entry:
> + %tmp0 = add i64 %val, 1
> + store i64 %tmp0, i64 addrspace(1)* %out
> + ret void
> +}
> +
One test that uses vector adds would be useful if one isn't there already
> declare i32 @llvm.r600.read.tidig.x() #0
> attributes #0 = { readnone }
> diff --git a/test/CodeGen/R600/sint_to_fp.f64.ll b/test/CodeGen/R600/sint_to_fp.f64.ll
> index 6e4f87c..efbdf25 100644
> --- a/test/CodeGen/R600/sint_to_fp.f64.ll
> +++ b/test/CodeGen/R600/sint_to_fp.f64.ll
> @@ -12,10 +12,10 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
>
> ; SI-LABEL: {{^}}sint_to_fp_i1_f64:
> ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
> -; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
> -; we should be able to fold the SGPRs into the V_CNDMASK instructions.
> -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
> -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
> +; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
> +; uses an SGPR for [[CMP]]
> +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
> +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
> ; SI: buffer_store_dwordx2
> ; SI: s_endpgm
> define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
> diff --git a/test/CodeGen/R600/uint_to_fp.f64.ll b/test/CodeGen/R600/uint_to_fp.f64.ll
> index d16872b..fa70bdf 100644
> --- a/test/CodeGen/R600/uint_to_fp.f64.ll
> +++ b/test/CodeGen/R600/uint_to_fp.f64.ll
> @@ -72,10 +72,10 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i
>
> ; SI-LABEL: {{^}}uint_to_fp_i1_to_f64:
> ; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
> -; FIXME: We should the VGPR sources for V_CNDMASK are copied from SGPRs,
> -; we should be able to fold the SGPRs into the V_CNDMASK instructions.
> -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
> -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CMP]]
> +; We can't fold the SGPRs into v_cndmask_b32_e64, because it already
> +; uses an SGPR for [[CMP]]
> +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
> +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]]
> ; SI: buffer_store_dwordx2
> ; SI: s_endpgm
> define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
> -- 1.8.5.5
>
> 0006-R600-SI-Add-a-V_MOV_B64-pseudo-instruction.patch
>
>
> From 8ff850ac46ab082a06882e743946297c2791d6e3 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Wed, 17 Dec 2014 16:35:19 -0500
> Subject: [PATCH 6/7] R600/SI: Add a V_MOV_B64 pseudo instruction
>
> This is used to simplify the SIFoldOperands pass and make it easier to
> fold immediates.
> ---
> lib/Target/R600/SIFoldOperands.cpp | 3 +++
> lib/Target/R600/SIInstrInfo.cpp | 29 +++++++++++++++++++++++++++
> lib/Target/R600/SIInstructions.td | 6 ++++++
> test/CodeGen/R600/atomic_cmp_swap_local.ll | 10 ++++------
> test/CodeGen/R600/imm.ll | 7 ++-----
> test/CodeGen/R600/local-atomics64.ll | 32 ++++++++++++------------------
> 6 files changed, 57 insertions(+), 30 deletions(-)
I thought you were avoiding this with the previous patch? Does the pass
handle constants specially and this is for SGPRs only?
I think the name might need something to indicate it's a
pseudoinstruction. v_mov_b64 sounds like a reasonable real instruction
name (one I wish would be added), so it might confuse people who aren't
aware it doesn't exist.
>
> diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp
> index c661b0a..1b0e09e 100644
> --- a/lib/Target/R600/SIFoldOperands.cpp
> +++ b/lib/Target/R600/SIFoldOperands.cpp
> @@ -86,6 +86,7 @@ static bool isSafeToFold(unsigned Opcode) {
> switch(Opcode) {
> case AMDGPU::V_MOV_B32_e32:
> case AMDGPU::V_MOV_B32_e64:
> + case AMDGPU::V_MOV_B64:
> case AMDGPU::S_MOV_B32:
> case AMDGPU::S_MOV_B64:
> case AMDGPU::COPY:
> @@ -195,6 +196,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
> UseMI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
> } else if (TRC->getSize() == 8 && TRI.isSGPRClass(TRC)) {
> UseMI->setDesc(TII->get(AMDGPU::S_MOV_B64));
> + } else if (TRC->getSize() == 8 && !TRI.isSGPRClass(TRC)) {
> + UseMI->setDesc(TII->get(AMDGPU::V_MOV_B64));
> } else {
> continue;
> }
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index a58a46e..fe29a50 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -662,6 +662,35 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
> // This is just a placeholder for register allocation.
> MI->eraseFromParent();
> break;
> +
> + case AMDGPU::V_MOV_B64: {
> + unsigned Dst = MI->getOperand(0).getReg();
> + unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
> + unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
> +
> + const MachineOperand &SrcOp = MI->getOperand(1);
> + // FIXME: Will this work for 64-bit floating point immediates?
> + assert(!SrcOp.isFPImm());
> + if (SrcOp.isImm()) {
> + APInt Imm(64, SrcOp.getImm());
> + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
> + .addImm(Imm.getLoBits(32).getZExtValue())
> + .addReg(Dst, RegState::Implicit);
> + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
> + .addImm(Imm.getHiBits(32).getZExtValue())
> + .addReg(Dst, RegState::Implicit);
> + } else {
> + assert(SrcOp.isReg());
> + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
> + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
> + .addReg(Dst, RegState::Implicit);
> + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
> + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
> + .addReg(Dst, RegState::Implicit);
> + }
> + MI->eraseFromParent();
> + break;
> + }
> }
> return true;
> }
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index 96f75f9..dbc090d 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -1742,6 +1742,12 @@ defm V_TRIG_PREOP_F64 : VOP3Inst <
> //===----------------------------------------------------------------------===//
> let isCodeGenOnly = 1, isPseudo = 1 in {
>
> +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
> +// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
> +// pass to enable folding of inline immediates.
> +def V_MOV_B64 : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>;
> +} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0
> +
> let hasSideEffects = 1 in {
> def SGPR_USE : InstSI <(outs),(ins), "", []>;
> }
> diff --git a/test/CodeGen/R600/atomic_cmp_swap_local.ll b/test/CodeGen/R600/atomic_cmp_swap_local.ll
> index 223f4d3..a77627c 100644
> --- a/test/CodeGen/R600/atomic_cmp_swap_local.ll
> +++ b/test/CodeGen/R600/atomic_cmp_swap_local.ll
> @@ -20,9 +20,8 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
> ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
> ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
> ; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
> -; SI: s_mov_b64 s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
> -; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
> +; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
> +; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
> ; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
> ; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
> ; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
> @@ -69,9 +68,8 @@ define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %sw
> ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_noret_i64_offset:
> ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
> ; SI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
> -; SI: s_mov_b64 s{{\[}}[[LOSCMP:[0-9]+]]:[[HISCMP:[0-9]+]]{{\]}}, 7
> -; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], s[[LOSCMP]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], s[[HISCMP]]
> +; SI-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
> +; SI-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
> ; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
> ; SI-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
> ; SI-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
> diff --git a/test/CodeGen/R600/imm.ll b/test/CodeGen/R600/imm.ll
> index 79f36b6..1cc03b8 100644
> --- a/test/CodeGen/R600/imm.ll
> +++ b/test/CodeGen/R600/imm.ll
> @@ -474,12 +474,9 @@ define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
> }
>
>
> -; FIXME: These shoudn't bother materializing in SGPRs
> -
> ; CHECK-LABEL: {{^}}store_inline_imm_0.0_f64
> -; CHECK: s_mov_b64 s{{\[}}[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]{{\]}}, 0{{$}}
> -; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
> -; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
> +; CHECK: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0
> +; CHECK: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0
> ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
> define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
> store double 0.0, double addrspace(1)* %out
> diff --git a/test/CodeGen/R600/local-atomics64.ll b/test/CodeGen/R600/local-atomics64.ll
> index ce0cf59..b39581e 100644
> --- a/test/CodeGen/R600/local-atomics64.ll
> +++ b/test/CodeGen/R600/local-atomics64.ll
> @@ -30,9 +30,8 @@ define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
>
> ; FUNC-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
> ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
> -; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
> -; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
> +; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
> +; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
> ; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
> ; SI: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
> ; SI: buffer_store_dwordx2 [[RESULT]],
> @@ -45,9 +44,8 @@ define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
> }
>
> ; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i64:
> -; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
> -; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
> +; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
> +; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
> ; SI: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
> ; SI: buffer_store_dwordx2 [[RESULT]],
> ; SI: s_endpgm
> @@ -87,9 +85,8 @@ define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
> }
>
> ; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i64:
> -; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
> -; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
> +; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
> +; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
> ; SI: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
> ; SI: buffer_store_dwordx2 [[RESULT]],
> ; SI: s_endpgm
> @@ -277,10 +274,9 @@ define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
>
> ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i64_offset:
> ; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
> -; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, 9
> -; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
> -; SI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
> +; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
> +; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
> +; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
> ; SI: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 [M0]
> ; SI: s_endpgm
> define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
> @@ -290,9 +286,8 @@ define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
> }
>
> ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i64:
> -; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
> -; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
> +; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
> +; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
> ; SI: ds_inc_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
> ; SI: s_endpgm
> define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
> @@ -327,9 +322,8 @@ define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
> }
>
> ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i64:
> -; SI: s_mov_b64 s{{\[}}[[LOSDATA:[0-9]+]]:[[HISDATA:[0-9]+]]{{\]}}, -1
> -; SI-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], s[[LOSDATA]]
> -; SI-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], s[[HISDATA]]
> +; SI: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], -1
> +; SI: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], -1
> ; SI: ds_dec_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
> ; SI: s_endpgm
> define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
> -- 1.8.5.5
>
> 0007-R600-SI-Remove-SIISelLowering-legalizeOperands.patch
>
>
> From dd174475c08471d665d33c6dbe665dafb35fe47c Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Thu, 11 Dec 2014 19:05:21 -0500
> Subject: [PATCH 7/7] R600/SI: Remove SIISelLowering::legalizeOperands()
>
> Its functionality has been replaced by calling
> SIInstrInfo::legalizeOperands() from
> SIISelLowering::AdjustInstrPostInstrSelection() and running the
> SIFoldOperands and SIShrinkInstructions passes.
> ---
> lib/Target/R600/SIISelLowering.cpp | 174 +-----------------------------------
> lib/Target/R600/SIISelLowering.h | 3 -
> test/CodeGen/R600/fneg.ll | 2 +-
> test/CodeGen/R600/imm.ll | 4 +-
> test/CodeGen/R600/seto.ll | 2 +-
> test/CodeGen/R600/setuo.ll | 2 +-
> test/CodeGen/R600/sint_to_fp.f64.ll | 4 +-
> test/CodeGen/R600/sint_to_fp.ll | 2 +-
> test/CodeGen/R600/uint_to_fp.f64.ll | 4 +-
> test/CodeGen/R600/uint_to_fp.ll | 2 +-
> test/CodeGen/R600/xor.ll | 2 +-
> 11 files changed, 13 insertions(+), 188 deletions(-)
LGTM
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20141219/30ab065d/attachment.html>
More information about the llvm-commits
mailing list