[PATCH] R600/SI: Fixes for 64-bit operations only on SALU
Matt Arsenault
Matthew.Arsenault at amd.com
Mon Mar 24 13:14:59 PDT 2014
On 03/24/2014 12:54 PM, Tom Stellard wrote:
> Hi Matt,
>
> I just tested these and there are no regressions. Go ahead and commit.
>
> -Tom
>
> On Mon, Mar 24, 2014 at 11:13:57AM -0700, Matt Arsenault wrote:
>> On 03/24/2014 08:01 AM, Tom Stellard wrote:
>>> On Fri, Mar 21, 2014 at 08:28:41PM -0700, Matt Arsenault wrote:
>>>> On 03/18/2014 03:10 AM, Matt Arsenault wrote:
>>>>> Hi,
>>>>>
>>>>> These patches change more instruction patterns to initially select
>>>>> the scalar version, to be more consistent with the other
>>>>> instructions. They also fix a few 64-bit operations that the SALU
>>>>> has but that are not available on the VALU.
>>>>>
>>>>> _______________________________________________
>>>>> llvm-commits mailing list
>>>>> llvm-commits at cs.uiuc.edu
>>>>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>>>> Updated patches
>>>>
>>>> From 3926aa34bd988e0560f878743c8e55d454633969 Mon Sep 17 00:00:00 2001
>>>> From: Matt Arsenault <arsenm2 at gmail.com>
>>>> Date: Fri, 21 Mar 2014 14:28:57 -0700
>>>> Subject: [PATCH 1/4] R600/SI: Move splitting 64-bit immediates to separate
>>>> function.
>>>>
>>>> ---
>>>> lib/Target/R600/SIInstrInfo.cpp | 92 ++++++++++++++++++++++++-----------------
>>>> lib/Target/R600/SIInstrInfo.h | 6 +++
>>>> 2 files changed, 59 insertions(+), 39 deletions(-)
>>>>
>>> Patch 1: LGTM.
>>>
>>>> From 63d3ec264e2539f172460e82cdab3415029bbdd3 Mon Sep 17 00:00:00 2001
>>>> From: Matt Arsenault <arsenm2 at gmail.com>
>>>> Date: Tue, 18 Mar 2014 02:29:55 -0700
>>>> Subject: [PATCH 2/4] R600/SI: Fix 64-bit bit ops that require the VALU.
>>>>
>>>> Try to match the scalar version first, like the other instructions.
>>>> Expand 64-bit ANDs into a pair of 32-bit ANDs, since a 64-bit AND is
>>>> not available on the VALU.
>>>> ---
>>>> lib/Target/R600/SIInstrInfo.cpp | 86 +++++++++++++++++++++++++++++++++++++++
>>>> lib/Target/R600/SIInstrInfo.h | 10 +++++
>>>> lib/Target/R600/SIInstructions.td | 2 +-
>>>> test/CodeGen/R600/or.ll | 33 +++++++++++----
>>>> 4 files changed, 123 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
>>>> index 2a66e17..ba552c5 100644
>>>> --- a/lib/Target/R600/SIInstrInfo.cpp
>>>> +++ b/lib/Target/R600/SIInstrInfo.cpp
>>>> @@ -519,6 +519,16 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
>>>> }
>>>> }
>>>> +// There is no 64-bit vector AND available, so a direct conversion is not
>>>> +// possible. We need to split this into two 32-bit ANDs.
>>>> +bool SIInstrInfo::isIllegalVALU64BitOp(unsigned Opcode) {
>>>> + switch (Opcode) {
>>>> + return true;
>>>> + default:
>>>> + return false;
>>>> + }
>>> This switch statement doesn't look right.
>> This is leftover from the original version; it isn't used anymore.
>>
>>>> +}
>>>> +
>>>> bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
>>>> return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
>>>> }
>>>> @@ -879,6 +889,30 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
>>>> Inst->eraseFromParent();
>>>> continue;
>>>> }
>>>> + case AMDGPU::S_AND_B64:
>>>> + splitScalar64BitOp(Worklist, Inst, AMDGPU::S_AND_B32);
>>>> + Inst->eraseFromParent();
>>>> + continue;
>>>> +
>>>> + case AMDGPU::S_OR_B64:
>>>> + splitScalar64BitOp(Worklist, Inst, AMDGPU::S_OR_B32);
>>>> + Inst->eraseFromParent();
>>>> + continue;
>>>> +
>>>> + case AMDGPU::S_XOR_B64:
>>>> + splitScalar64BitOp(Worklist, Inst, AMDGPU::S_XOR_B32);
>>>> + Inst->eraseFromParent();
>>>> + continue;
>>>> +
>>>> + case AMDGPU::S_NOT_B64:
>>>> + splitScalar64BitOp(Worklist, Inst, AMDGPU::S_NOT_B32);
>>>> + Inst->eraseFromParent();
>>>> + continue;
>>>> +
>>>> + case AMDGPU::S_BFE_U64:
>>>> + case AMDGPU::S_BFE_I64:
>>>> + case AMDGPU::S_BFM_B64:
>>>> + llvm_unreachable("Moving this op to VALU not implemented");
>>>> }
>>>> unsigned NewOpcode = getVALUOp(*Inst);
>>>> @@ -968,6 +1002,58 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
>>>> return &AMDGPU::VReg_32RegClass;
>>>> }
>>>> +void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
>>>> + MachineInstr *Inst,
>>>> + unsigned Opcode) const {
>>>> + MachineBasicBlock &MBB = *Inst->getParent();
>>>> + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
>>>> +
>>>> + // We shouldn't need to worry about immediate operands here.
>>> Originally, I wondered why not, but I see you fix this in the next patch.
>>>
>>>> + MachineOperand &Dest = Inst->getOperand(0);
>>>> + MachineOperand &Src0 = Inst->getOperand(1);
>>>> + MachineOperand &Src1 = Inst->getOperand(2);
>>>> + DebugLoc DL = Inst->getDebugLoc();
>>>> +
>>>> + MachineBasicBlock::iterator MII = Inst;
>>>> +
>>>> + const MCInstrDesc &InstDesc = get(Opcode);
>>>> + const TargetRegisterClass *RC = MRI.getRegClass(Src0.getReg());
>>>> + const TargetRegisterClass *SubRC = RI.getSubRegClass(RC, AMDGPU::sub0);
>>>> + unsigned SrcReg0Sub0 = buildExtractSubReg(MII, MRI, Src0, RC,
>>>> + AMDGPU::sub0, SubRC);
>>>> + unsigned SrcReg1Sub0 = buildExtractSubReg(MII, MRI, Src1, RC,
>>>> + AMDGPU::sub0, SubRC);
>>>> +
>>>> + unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
>>>> + MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
>>>> + .addReg(SrcReg0Sub0)
>>>> + .addReg(SrcReg1Sub0);
>>>> +
>>>> + unsigned SrcReg0Sub1 = buildExtractSubReg(MII, MRI, Src0, RC,
>>>> + AMDGPU::sub1, SubRC);
>>>> + unsigned SrcReg1Sub1 = buildExtractSubReg(MII, MRI, Src1, RC,
>>>> + AMDGPU::sub1, SubRC);
>>>> +
>>>> + unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
>>>> + MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
>>>> + .addReg(SrcReg0Sub1)
>>>> + .addReg(SrcReg1Sub1);
>>>> +
>>>> + unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
>>>> + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
>>>> + .addReg(DestSub0)
>>>> + .addImm(AMDGPU::sub0)
>>>> + .addReg(DestSub1)
>>>> + .addImm(AMDGPU::sub1);
>>>> +
>>>> + MRI.replaceRegWith(Dest.getReg(), FullDestReg);
>>>> +
>>>> + // Try to legalize the operands in case we need to swap the order to keep it
>>>> + // valid.
>>>> + Worklist.push_back(LoHalf);
>>>> + Worklist.push_back(HiHalf);
>>>> +}
>>>> +
>>>> MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
>>>> MachineBasicBlock *MBB,
>>>> MachineBasicBlock::iterator I,
>>>> diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
>>>> index 8c0fb6f..ca0b72b 100644
>>>> --- a/lib/Target/R600/SIInstrInfo.h
>>>> +++ b/lib/Target/R600/SIInstrInfo.h
>>>> @@ -38,6 +38,10 @@ private:
>>>> const TargetRegisterClass *RC,
>>>> const MachineOperand &Op) const;
>>>> + void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
>>>> + MachineInstr *Inst, unsigned Opcode) const;
>>>> +
>>>> +
>>>> public:
>>>> explicit SIInstrInfo(AMDGPUTargetMachine &tm);
>>>> @@ -92,6 +96,12 @@ public:
>>>> bool isSALUInstr(const MachineInstr &MI) const;
>>>> static unsigned getVALUOp(const MachineInstr &MI);
>>>> +
>>>> + /// \brief Return true if the opcode is a 64-bit opcode available on the SALU,
>>>> + /// but requires a more complex handling than those with a direct VALU
>>>> + /// equivalent.
>>>> + static bool isIllegalVALU64BitOp(unsigned Opcode);
>>>> +
>>>> bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
>>>> /// \brief Return the correct register class for \p OpNo. For target-specific
>>>> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
>>>> index eb10541..ba1d153 100644
>>>> --- a/lib/Target/R600/SIInstructions.td
>>>> +++ b/lib/Target/R600/SIInstructions.td
>>>> @@ -1234,7 +1234,7 @@ def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32",
>>>> >;
>>>> def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64",
>>>> - []
>>>> + [(set i64:$dst, (or i64:$src0, i64:$src1))]
>>>> >;
>>>> def : Pat <
>>>> diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
>>>> index 35fc8b3..05d1e0f 100644
>>>> --- a/test/CodeGen/R600/or.ll
>>>> +++ b/test/CodeGen/R600/or.ll
>>>> @@ -56,15 +56,34 @@ define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b)
>>>> ret void
>>>> }
>>>> -; EG-CHECK-LABEL: @or_i64
>>>> +; EG-CHECK-LABEL: @scalar_or_i64
>>>> ; EG-CHECK-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
>>>> ; EG-CHECK-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
>>>> -; SI-CHECK-LABEL: @or_i64
>>>> +; SI-CHECK-LABEL: @scalar_or_i64
>>>> +; SI-CHECK: S_OR_B64
>>>> +define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
>>>> + %or = or i64 %a, %b
>>>> + store i64 %or, i64 addrspace(1)* %out
>>>> + ret void
>>>> +}
>>>> +
>>>> +; SI-CHECK-LABEL: @vector_or_i64
>>>> ; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
>>>> ; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
>>>> -define void @or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
>>>> -entry:
>>>> - %0 = or i64 %a, %b
>>>> - store i64 %0, i64 addrspace(1)* %out
>>>> - ret void
>>>> +define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
>>>> + %loada = load i64 addrspace(1)* %a, align 8
>>>> + %loadb = load i64 addrspace(1)* %a, align 8
>>>> + %or = or i64 %loada, %loadb
>>>> + store i64 %or, i64 addrspace(1)* %out
>>>> + ret void
>>>> +}
>>>> +
>>>> +; SI-CHECK-LABEL: @scalar_vector_or_i64
>>>> +; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
>>>> +; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
>>>> +define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
>>>> + %loada = load i64 addrspace(1)* %a
>>>> + %or = or i64 %loada, %b
>>>> + store i64 %or, i64 addrspace(1)* %out
>>>> + ret void
>>>> }
>>>> --
>>>> 1.8.3.2
>>>>
>>>> From 912fba570260eb06270e86455e08e15ed4fded51 Mon Sep 17 00:00:00 2001
>>>> From: Matt Arsenault <arsenm2 at gmail.com>
>>>> Date: Fri, 21 Mar 2014 15:50:38 -0700
>>>> Subject: [PATCH 3/4] R600/SI: Sub-optimal fix for 64-bit immediates with SALU
>>>> ops.
>>>>
>>>> No longer asserts, but now you get moves loading legal immediates
>>>> into the split 32-bit operations.
>>>> ---
>>>> lib/Target/R600/SIInstrInfo.cpp | 53 ++++++++++++++++-------
>>>> lib/Target/R600/SIInstrInfo.h | 8 +++-
>>>> test/CodeGen/R600/or.ll | 93 ++++++++++++++++++++++++++---------------
>>>> 3 files changed, 104 insertions(+), 50 deletions(-)
>>>>
>>>> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
>>>> index ba552c5..2e948f8 100644
>>>> --- a/lib/Target/R600/SIInstrInfo.cpp
>>>> +++ b/lib/Target/R600/SIInstrInfo.cpp
>>>> @@ -601,6 +601,28 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
>>>> return SubReg;
>>>> }
>>>> +MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
>>>> + MachineBasicBlock::iterator MII,
>>>> + MachineRegisterInfo &MRI,
>>>> + MachineOperand &Op,
>>>> + const TargetRegisterClass *SuperRC,
>>>> + unsigned SubIdx,
>>>> + const TargetRegisterClass *SubRC) const {
>>>> + if (Op.isImm()) {
>>>> + // XXX - Is there a better way to do this?
>>> I don't think so.
>>>
>>>> + if (SubIdx == AMDGPU::sub0)
>>>> + return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF);
>>>> + if (SubIdx == AMDGPU::sub1)
>>>> + return MachineOperand::CreateImm(Op.getImm() >> 32);
>>>> +
>>>> + llvm_unreachable("Unhandled register index for immediate");
>>>> + }
>>>> +
>>>> + unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
>>>> + SubIdx, SubRC);
>>>> + return MachineOperand::CreateReg(SubReg, false);
>>>> +}
>>>> +
>>>> unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
>>>> MachineBasicBlock::iterator MI,
>>>> MachineRegisterInfo &MRI,
>>>> @@ -1008,7 +1030,6 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
>>>> MachineBasicBlock &MBB = *Inst->getParent();
>>>> MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
>>>> - // We shouldn't need to worry about immediate operands here.
>>>> MachineOperand &Dest = Inst->getOperand(0);
>>>> MachineOperand &Src0 = Inst->getOperand(1);
>>>> MachineOperand &Src1 = Inst->getOperand(2);
>>>> @@ -1019,27 +1040,27 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
>>>> const MCInstrDesc &InstDesc = get(Opcode);
>>>> const TargetRegisterClass *RC = MRI.getRegClass(Src0.getReg());
>>>> const TargetRegisterClass *SubRC = RI.getSubRegClass(RC, AMDGPU::sub0);
>>>> - unsigned SrcReg0Sub0 = buildExtractSubReg(MII, MRI, Src0, RC,
>>>> - AMDGPU::sub0, SubRC);
>>>> - unsigned SrcReg1Sub0 = buildExtractSubReg(MII, MRI, Src1, RC,
>>>> - AMDGPU::sub0, SubRC);
>>>> + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, RC,
>>>> + AMDGPU::sub0, SubRC);
>>>> + MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, RC,
>>>> + AMDGPU::sub0, SubRC);
>>>> - unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
>>>> + unsigned DestSub0 = MRI.createVirtualRegister(SubRC);
>>>> MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
>>>> - .addReg(SrcReg0Sub0)
>>>> - .addReg(SrcReg1Sub0);
>>>> + .addOperand(SrcReg0Sub0)
>>>> + .addOperand(SrcReg1Sub0);
>>>> - unsigned SrcReg0Sub1 = buildExtractSubReg(MII, MRI, Src0, RC,
>>>> - AMDGPU::sub1, SubRC);
>>>> - unsigned SrcReg1Sub1 = buildExtractSubReg(MII, MRI, Src1, RC,
>>>> - AMDGPU::sub1, SubRC);
>>>> + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, RC,
>>>> + AMDGPU::sub1, SubRC);
>>>> + MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, RC,
>>>> + AMDGPU::sub1, SubRC);
>>>> - unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
>>>> + unsigned DestSub1 = MRI.createVirtualRegister(SubRC);
>>>> MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
>>>> - .addReg(SrcReg0Sub1)
>>>> - .addReg(SrcReg1Sub1);
>>>> + .addOperand(SrcReg0Sub1)
>>>> + .addOperand(SrcReg1Sub1);
>>>> - unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
>>>> + unsigned FullDestReg = MRI.createVirtualRegister(RC);
>>>> BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
>>>> .addReg(DestSub0)
>>>> .addImm(AMDGPU::sub0)
>>>> diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
>>>> index ca0b72b..f8bae3a 100644
>>>> --- a/lib/Target/R600/SIInstrInfo.h
>>>> +++ b/lib/Target/R600/SIInstrInfo.h
>>>> @@ -31,6 +31,12 @@ private:
>>>> const TargetRegisterClass *SuperRC,
>>>> unsigned SubIdx,
>>>> const TargetRegisterClass *SubRC) const;
>>>> + MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI,
>>>> + MachineRegisterInfo &MRI,
>>>> + MachineOperand &SuperReg,
>>>> + const TargetRegisterClass *SuperRC,
>>>> + unsigned SubIdx,
>>>> + const TargetRegisterClass *SubRC) const;
>>>> unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist,
>>>> MachineBasicBlock::iterator MI,
>>>> @@ -38,7 +44,7 @@ private:
>>>> const TargetRegisterClass *RC,
>>>> const MachineOperand &Op) const;
>>>> - void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
>>>> + void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> & Worklist,
>>>> MachineInstr *Inst, unsigned Opcode) const;
>>>> diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
>>>> index 05d1e0f..8e985c7 100644
>>>> --- a/test/CodeGen/R600/or.ll
>>>> +++ b/test/CodeGen/R600/or.ll
>>>> @@ -1,13 +1,13 @@
>>>> -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
>>>> -;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
>>>> +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
>>>> +;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
>>>> -; EG-CHECK-LABEL: @or_v2i32
>>>> -; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>>>> -; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>>>> +; EG-LABEL: @or_v2i32
>>>> +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>>>> +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>>>> -;SI-CHECK-LABEL: @or_v2i32
>>>> -;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
>>>> -;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
>>>> +; SI-LABEL: @or_v2i32
>>>> +; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
>>>> +; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
>>>> define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
>>>> %b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
>>>> @@ -18,17 +18,17 @@ define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in)
>>>> ret void
>>>> }
>>>> -; EG-CHECK-LABEL: @or_v4i32
>>>> -; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>>>> -; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>>>> -; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>>>> -; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>>>> +; EG-LABEL: @or_v4i32
>>>> +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>>>> +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>>>> +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>>>> +; EG: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
>>>> -;SI-CHECK-LABEL: @or_v4i32
>>>> -;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
>>>> -;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
>>>> -;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
>>>> -;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
>>>> +; SI-LABEL: @or_v4i32
>>>> +; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
>>>> +; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
>>>> +; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
>>>> +; SI: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
>>>> define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
>>>> %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
>>>> @@ -39,16 +39,16 @@ define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in)
>>>> ret void
>>>> }
>>>> -; SI-CHECK-LABEL: @scalar_or_i32
>>>> -; SI-CHECK: S_OR_B32
>>>> +; SI-LABEL: @scalar_or_i32
>>>> +; SI: S_OR_B32
>>>> define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
>>>> %or = or i32 %a, %b
>>>> store i32 %or, i32 addrspace(1)* %out
>>>> ret void
>>>> }
>>>> -; SI-CHECK-LABEL: @vector_or_i32
>>>> -; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
>>>> +; SI-LABEL: @vector_or_i32
>>>> +; SI: V_OR_B32_e32 v{{[0-9]}}
>>>> define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
>>>> %loada = load i32 addrspace(1)* %a
>>>> %or = or i32 %loada, %b
>>>> @@ -56,20 +56,20 @@ define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b)
>>>> ret void
>>>> }
>>>> -; EG-CHECK-LABEL: @scalar_or_i64
>>>> -; EG-CHECK-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
>>>> -; EG-CHECK-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
>>>> -; SI-CHECK-LABEL: @scalar_or_i64
>>>> -; SI-CHECK: S_OR_B64
>>>> +; EG-LABEL: @scalar_or_i64
>>>> +; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
>>>> +; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
>>>> +; SI-LABEL: @scalar_or_i64
>>>> +; SI: S_OR_B64
>>>> define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
>>>> %or = or i64 %a, %b
>>>> store i64 %or, i64 addrspace(1)* %out
>>>> ret void
>>>> }
>>>> -; SI-CHECK-LABEL: @vector_or_i64
>>>> -; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
>>>> -; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
>>>> +; SI-LABEL: @vector_or_i64
>>>> +; SI: V_OR_B32_e32 v{{[0-9]}}
>>>> +; SI: V_OR_B32_e32 v{{[0-9]}}
>>>> define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
>>>> %loada = load i64 addrspace(1)* %a, align 8
>>>> %loadb = load i64 addrspace(1)* %a, align 8
>>>> @@ -78,12 +78,39 @@ define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
>>>> ret void
>>>> }
>>>> -; SI-CHECK-LABEL: @scalar_vector_or_i64
>>>> -; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
>>>> -; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
>>>> +; SI-LABEL: @scalar_vector_or_i64
>>>> +; SI: V_OR_B32_e32 v{{[0-9]}}
>>>> +; SI: V_OR_B32_e32 v{{[0-9]}}
>>>> define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
>>>> %loada = load i64 addrspace(1)* %a
>>>> %or = or i64 %loada, %b
>>>> store i64 %or, i64 addrspace(1)* %out
>>>> ret void
>>>> }
>>>> +
>>>> +; SI-LABEL: @vector_or_i64_loadimm
>>>> +; SI-DAG: S_MOV_B32
>>>> +; SI-DAG: S_MOV_B32
>>>> +; SI-DAG: BUFFER_LOAD_DWORDX2
>>>> +; SI: V_OR_B32_e32
>>>> +; SI: V_OR_B32_e32
>>>> +; SI: S_ENDPGM
>>>> +define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
>>>> + %loada = load i64 addrspace(1)* %a, align 8
>>>> + %or = or i64 %loada, 22470723082367
>>>> + store i64 %or, i64 addrspace(1)* %out
>>>> + ret void
>>>> +}
>>>> +
>>>> +; FIXME: The or 0 should really be removed.
>>>> +; SI-LABEL: @vector_or_i64_imm
>>>> +; SI: BUFFER_LOAD_DWORDX2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
>>>> +; SI: V_OR_B32_e32 {{v[0-9]+}}, 8, v[[LO_VREG]]
>>>> +; SI: V_OR_B32_e32 {{v[0-9]+}}, 0, {{.*}}
>>>> +; SI: S_ENDPGM
>>>> +define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
>>>> + %loada = load i64 addrspace(1)* %a, align 8
>>>> + %or = or i64 %loada, 8
>>>> + store i64 %or, i64 addrspace(1)* %out
>>>> + ret void
>>>> +}
>>>> --
>>>> 1.8.3.2
>>>>
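For reference, the immediate case handled by buildExtractSubRegOrImm above is
just the low and high 32-bit halves of the constant. A standalone C++ sketch
of the arithmetic for the constant used in @vector_or_i64_loadimm
(illustrative only, assuming the usual two's-complement int64_t behavior, not
the in-tree code):

    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t Imm = 22470723082367;            // constant from the or.ll test
      int32_t Lo = int32_t(Imm & 0xFFFFFFFF);  // low half, prints as -545810305
      int32_t Hi = int32_t(Imm >> 32);         // high half, prints as 5231
      assert(Lo == -545810305 && Hi == 5231);
      // Recombining the halves gives back the original value.
      assert(((int64_t(Hi) << 32) | uint32_t(Lo)) == Imm);
      return 0;
    }

Those are the two values that show up in the S_MOV_B32 CHECK lines once
patch 4 pins them down.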
>>>> From d4948cd6ff13398a74efce0573b6f89e5e7b9750 Mon Sep 17 00:00:00 2001
>>>> From: Matt Arsenault <arsenm2 at gmail.com>
>>>> Date: Fri, 21 Mar 2014 20:00:24 -0700
>>>> Subject: [PATCH 4/4] R600/SI: Fix extra mov from legalizing 64-bit SALU ops.
>>>>
>>>> Check the register class of each operand individually
>>>> to avoid an extra copy to a vgpr.
>>> I don't understand why this is necessary. Can you provide an example?
>>>
>>> Thanks,
>>> Tom
>> The test I filled in (@vector_or_i64_loadimm in or.ll) is the
>> example I ran into, which is why the operands weren't checked when
>> this was originally added.
>>
>> It was something like s_or_b64 v0, v1, s0, where s0 is just a loaded
>> constant. Using src0's register class for both operands would end up
>> materializing the immediate twice: first an s_mov into an SGPR, then
>> a v_mov into a VGPR. Checking the register class of each operand
>> individually lets the VALU op use the SGPR constant directly as src0,
>> so the extra v_mov goes away.
>>
>>
>>>> ---
>>>> lib/Target/R600/SIInstrInfo.cpp | 40 ++++++++++++++++++++++++++--------------
>>>> test/CodeGen/R600/or.ll | 10 +++++-----
>>>> 2 files changed, 31 insertions(+), 19 deletions(-)
>>>>
>>>> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
>>>> index 2e948f8..e187991 100644
>>>> --- a/lib/Target/R600/SIInstrInfo.cpp
>>>> +++ b/lib/Target/R600/SIInstrInfo.cpp
>>>> @@ -1038,29 +1038,41 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
>>>> MachineBasicBlock::iterator MII = Inst;
>>>> const MCInstrDesc &InstDesc = get(Opcode);
>>>> - const TargetRegisterClass *RC = MRI.getRegClass(Src0.getReg());
>>>> - const TargetRegisterClass *SubRC = RI.getSubRegClass(RC, AMDGPU::sub0);
>>>> - MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, RC,
>>>> - AMDGPU::sub0, SubRC);
>>>> - MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, RC,
>>>> - AMDGPU::sub0, SubRC);
>>>> -
>>>> - unsigned DestSub0 = MRI.createVirtualRegister(SubRC);
>>>> + const TargetRegisterClass *Src0RC = Src0.isReg() ?
>>>> + MRI.getRegClass(Src0.getReg()) :
>>>> + &AMDGPU::SGPR_32RegClass;
>>>> +
>>>> + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
>>>> + const TargetRegisterClass *Src1RC = Src1.isReg() ?
>>>> + MRI.getRegClass(Src1.getReg()) :
>>>> + &AMDGPU::SGPR_32RegClass;
>>>> +
>>>> + const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
>>>> +
>>>> + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
>>>> + AMDGPU::sub0, Src0SubRC);
>>>> + MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
>>>> + AMDGPU::sub0, Src1SubRC);
>>>> +
>>>> + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
>>>> + const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0);
>>>> +
>>>> + unsigned DestSub0 = MRI.createVirtualRegister(DestRC);
>>>> MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
>>>> .addOperand(SrcReg0Sub0)
>>>> .addOperand(SrcReg1Sub0);
>>>> - MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, RC,
>>>> - AMDGPU::sub1, SubRC);
>>>> - MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, RC,
>>>> - AMDGPU::sub1, SubRC);
>>>> + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
>>>> + AMDGPU::sub1, Src0SubRC);
>>>> + MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
>>>> + AMDGPU::sub1, Src1SubRC);
>>>> - unsigned DestSub1 = MRI.createVirtualRegister(SubRC);
>>>> + unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC);
>>>> MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
>>>> .addOperand(SrcReg0Sub1)
>>>> .addOperand(SrcReg1Sub1);
>>>> - unsigned FullDestReg = MRI.createVirtualRegister(RC);
>>>> + unsigned FullDestReg = MRI.createVirtualRegister(DestRC);
>>>> BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
>>>> .addReg(DestSub0)
>>>> .addImm(AMDGPU::sub0)
>>>> diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
>>>> index 8e985c7..be984b2 100644
>>>> --- a/test/CodeGen/R600/or.ll
>>>> +++ b/test/CodeGen/R600/or.ll
>>>> @@ -89,11 +89,11 @@ define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a,
>>>> }
>>>> ; SI-LABEL: @vector_or_i64_loadimm
>>>> -; SI-DAG: S_MOV_B32
>>>> -; SI-DAG: S_MOV_B32
>>>> -; SI-DAG: BUFFER_LOAD_DWORDX2
>>>> -; SI: V_OR_B32_e32
>>>> -; SI: V_OR_B32_e32
>>>> +; SI-DAG: S_MOV_B32 [[LO_S_IMM:s[0-9]+]], -545810305
>>>> +; SI-DAG: S_MOV_B32 [[HI_S_IMM:s[0-9]+]], 5231
>>>> +; SI-DAG: BUFFER_LOAD_DWORDX2 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}},
>>>> +; SI-DAG: V_OR_B32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
>>>> +; SI-DAG: V_OR_B32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
>>>> ; SI: S_ENDPGM
>>>> define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
>>>> %loada = load i64 addrspace(1)* %a, align 8
>>>> --
>>>> 1.8.3.2
>>>>
>>>> _______________________________________________
>>>> llvm-commits mailing list
>>>> llvm-commits at cs.uiuc.edu
>>>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>>
Committed as r204660-204662.