[Mesa-dev] [PATCH] R600/SI: fix MIMG writemask adjustment
Marek Olšák
maraeo at gmail.com
Wed Oct 9 07:39:14 PDT 2013
No, I just added the test.
Marek
On Wed, Oct 9, 2013 at 4:28 PM, Christian König <deathsimple at vodafone.de> wrote:
> Are there any changes to the original patch you've sent out?
>
> Anyway this version is: Reviewed-by: Christian König
> <christian.koenig at amd.com>
>
> Am 09.10.2013 15:33, schrieb Marek Olšák:
>
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> This fixes piglit:
>> - shaders/glsl-fs-texture2d-masked
>> - shaders/glsl-fs-texture2d-masked-4
>>
>> Signed-off-by: Marek Olšák <marek.olsak at amd.com>
>> ---
>> lib/Target/R600/SIISelLowering.cpp | 27 +++++++--
>> test/CodeGen/R600/llvm.SI.sample-masked.ll | 93
>> ++++++++++++++++++++++++++++++
>> 2 files changed, 114 insertions(+), 6 deletions(-)
>> create mode 100644 test/CodeGen/R600/llvm.SI.sample-masked.ll
>>
>> diff --git a/lib/Target/R600/SIISelLowering.cpp
>> b/lib/Target/R600/SIISelLowering.cpp
>> index 2174753..891a51b 100644
>> --- a/lib/Target/R600/SIISelLowering.cpp
>> +++ b/lib/Target/R600/SIISelLowering.cpp
>> @@ -1065,7 +1065,9 @@ static unsigned SubIdx2Lane(unsigned Idx) {
>> void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
>> SelectionDAG &DAG) const {
>> SDNode *Users[4] = { };
>> - unsigned Writemask = 0, Lane = 0;
>> + unsigned Lane = 0;
>> + unsigned OldDmask = Node->getConstantOperandVal(0);
>> + unsigned NewDmask = 0;
>> // Try to figure out the used register components
>> for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
>> @@ -1076,29 +1078,42 @@ void
>> SITargetLowering::adjustWritemask(MachineSDNode *&Node,
>> I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
>> return;
>> + /* Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
>> + * Note that subregs are packed, i.e. Lane==0 is the first bit set
>> + * in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second
>> bit
>> + * set, etc. */
>> Lane = SubIdx2Lane(I->getConstantOperandVal(1));
>> + // Set which texture component corresponds to the lane.
>> + unsigned Comp;
>> + for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
>> + assert(Dmask);
>> + Comp = ffs(Dmask)-1;
>> + Dmask &= ~(1 << Comp);
>> + }
>> +
>> // Abort if we have more than one user per component
>> if (Users[Lane])
>> return;
>> Users[Lane] = *I;
>> - Writemask |= 1 << Lane;
>> + NewDmask |= 1 << Comp;
>> }
>> - // Abort if all components are used
>> - if (Writemask == 0xf)
>> + // Abort if there's no change
>> + if (NewDmask == OldDmask)
>> return;
>> // Adjust the writemask in the node
>> std::vector<SDValue> Ops;
>> - Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32));
>> + Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
>> for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
>> Ops.push_back(Node->getOperand(i));
>> Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(),
>> Ops.size());
>> // If we only got one lane, replace it with a copy
>> - if (Writemask == (1U << Lane)) {
>> + // (if NewDmask has only one bit set...)
>> + if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
>> SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID,
>> MVT::i32);
>> SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
>> SDLoc(),
>> Users[Lane]->getValueType(0),
>> diff --git a/test/CodeGen/R600/llvm.SI.sample-masked.ll
>> b/test/CodeGen/R600/llvm.SI.sample-masked.ll
>> new file mode 100644
>> index 0000000..1b4cc4e
>> --- /dev/null
>> +++ b/test/CodeGen/R600/llvm.SI.sample-masked.ll
>> @@ -0,0 +1,93 @@
>> +;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
>> +
>> +; CHECK: @v1
>> +; CHECK: IMAGE_SAMPLE VGPR{{[[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}, 13
>> +define void @v1(i32 %a1) {
>> +entry:
>> + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
>> + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8>
>> undef, <16 x i8> undef, i32 0)
>> + %2 = extractelement <4 x float> %1, i32 0
>> + %3 = extractelement <4 x float> %1, i32 2
>> + %4 = extractelement <4 x float> %1, i32 3
>> + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float
>> %2, float %3, float %4, float %4)
>> + ret void
>> +}
>> +
>> +; CHECK: @v2
>> +; CHECK: IMAGE_SAMPLE VGPR{{[[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}, 11
>> +define void @v2(i32 %a1) {
>> +entry:
>> + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
>> + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8>
>> undef, <16 x i8> undef, i32 0)
>> + %2 = extractelement <4 x float> %1, i32 0
>> + %3 = extractelement <4 x float> %1, i32 1
>> + %4 = extractelement <4 x float> %1, i32 3
>> + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float
>> %2, float %3, float %4, float %4)
>> + ret void
>> +}
>> +
>> +; CHECK: @v3
>> +; CHECK: IMAGE_SAMPLE VGPR{{[[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}, 14
>> +define void @v3(i32 %a1) {
>> +entry:
>> + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
>> + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8>
>> undef, <16 x i8> undef, i32 0)
>> + %2 = extractelement <4 x float> %1, i32 1
>> + %3 = extractelement <4 x float> %1, i32 2
>> + %4 = extractelement <4 x float> %1, i32 3
>> + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float
>> %2, float %3, float %4, float %4)
>> + ret void
>> +}
>> +
>> +; CHECK: @v4
>> +; CHECK: IMAGE_SAMPLE VGPR{{[[0-9]}}_VGPR{{[0-9]}}_VGPR{{[0-9]}}, 7
>> +define void @v4(i32 %a1) {
>> +entry:
>> + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
>> + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8>
>> undef, <16 x i8> undef, i32 0)
>> + %2 = extractelement <4 x float> %1, i32 0
>> + %3 = extractelement <4 x float> %1, i32 1
>> + %4 = extractelement <4 x float> %1, i32 2
>> + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float
>> %2, float %3, float %4, float %4)
>> + ret void
>> +}
>> +
>> +; CHECK: @v5
>> +; CHECK: IMAGE_SAMPLE VGPR{{[[0-9]}}_VGPR{{[0-9]}}, 10
>> +define void @v5(i32 %a1) {
>> +entry:
>> + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
>> + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8>
>> undef, <16 x i8> undef, i32 0)
>> + %2 = extractelement <4 x float> %1, i32 1
>> + %3 = extractelement <4 x float> %1, i32 3
>> + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float
>> %2, float %3, float %3, float %3)
>> + ret void
>> +}
>> +
>> +; CHECK: @v6
>> +; CHECK: IMAGE_SAMPLE VGPR{{[[0-9]}}_VGPR{{[0-9]}}, 6
>> +define void @v6(i32 %a1) {
>> +entry:
>> + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
>> + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8>
>> undef, <16 x i8> undef, i32 0)
>> + %2 = extractelement <4 x float> %1, i32 1
>> + %3 = extractelement <4 x float> %1, i32 2
>> + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float
>> %2, float %3, float %3, float %3)
>> + ret void
>> +}
>> +
>> +; CHECK: @v7
>> +; CHECK: IMAGE_SAMPLE VGPR{{[[0-9]}}_VGPR{{[0-9]}}, 9
>> +define void @v7(i32 %a1) {
>> +entry:
>> + %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
>> + %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8>
>> undef, <16 x i8> undef, i32 0)
>> + %2 = extractelement <4 x float> %1, i32 0
>> + %3 = extractelement <4 x float> %1, i32 3
>> + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float
>> %2, float %3, float %3, float %3)
>> + ret void
>> +}
>> +
>> +declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x
>> i8>, i32) readnone
>> +
>> +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float,
>> float, float)
>
>
More information about the llvm-commits
mailing list