[PATCH] D129735: [WIP][RISCV] Add new pass to transform undef to pseudo for vector values.

Thu Oct 20 23:26:09 PDT 2022

craig.topper added a comment.

In D129735#3873436 <https://reviews.llvm.org/D129735#3873436>, @BeMg wrote:

> In D129735#3871632 <https://reviews.llvm.org/D129735#3871632>, @craig.topper wrote:
>
>> Does this patch work for this test case
>>
>>   define internal void @foo() {
>>   loopIR.preheader.i.i:
>>     %v15 = tail call <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
>>     %v17 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv1i16(<vscale x 8 x i16> poison, <vscale x 1 x i16> %v15, i64 0)
>>     %vs12.i.i.i = add <vscale x 1 x i16> %v15, shufflevector (<vscale x 1 x i16> insertelement (<vscale x 1 x i16> poison, i16 1, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer)
>>     %v18 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv1i16(<vscale x 8 x i16> poison, <vscale x 1 x i16> %vs12.i.i.i, i64 0)
>>     %vs16.i.i.i = add <vscale x 1 x i16> %v15, shufflevector (<vscale x 1 x i16> insertelement (<vscale x 1 x i16> poison, i16 3, i32 0), <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer)
>>     %v20 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv1i16(<vscale x 8 x i16> poison, <vscale x 1 x i16> %vs16.i.i.i, i64 0)
>>     br label %loopIR3.i.i
>>   
>>   loopIR3.i.i:                                      ; preds = %loopIR3.i.i, %loopIR.preheader.i.i
>>     %v37 = load <vscale x 8 x i8>, ptr addrspace(1) null, align 8
>>     %v38 = tail call <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8.i64(<vscale x 8 x i8> undef, <vscale x 8 x i8> %v37, <vscale x 8 x i16> %v17, i64 4)
>>     %v40 = tail call <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8.i64(<vscale x 8 x i8> undef, <vscale x 8 x i8> %v37, <vscale x 8 x i16> %v18, i64 4)
>>     %v42 = and <vscale x 8 x i8> %v38, %v40
>>     %v46 = tail call <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8.i64(<vscale x 8 x i8> undef, <vscale x 8 x i8> %v37, <vscale x 8 x i16> %v20, i64 4)
>>     %v60 = and <vscale x 8 x i8> %v42, %v46
>>     store <vscale x 8 x i8> %v60, ptr addrspace(1) null, align 4
>>     br label %loopIR3.i.i
>>   }
>>   
>>   declare <vscale x 1 x i16> @llvm.experimental.stepvector.nxv1i16()
>>   
>>   declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv1i16(<vscale x 8 x i16>, <vscale x 1 x i16>, i64 immarg) 
>>   
>>   declare <vscale x 8 x i8> @llvm.riscv.vrgatherei16.vv.nxv8i8.i64(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i16>, i64)
>
> This IR doesn't generate the undef+early-clobber situation, so this pass will not work on it.
>
> The RA result does not break the early-clobber constraint in the current compiler.
>
> The corresponding MachineInstrs before RA are shown below:
>
>   ********** MACHINEINSTRS **********
>   # Machine code for function foo: NoPHIs, TracksLiveness, TiedOpsRewritten, TracksDebugUserValues
>   
>   0B      bb.0.loopIR.preheader.i.i:
>             successors: %bb.1(0x80000000); %bb.1(100.00%)
>   
>   16B       dead %16:gpr = PseudoVSETVLIX0 $x0, 206, implicit-def $vl, implicit-def $vtype
>   32B       undef %0.sub_vrm1_0:vrm2 = PseudoVID_V_MF4 -1, 4, implicit $vl, implicit $vtype
>   64B       undef %1.sub_vrm1_0:vrm2 = PseudoVADD_VI_MF4 %0.sub_vrm1_0:vrm2, 1, -1, 4, implicit $vl, implicit $vtype
>   96B       undef %2.sub_vrm1_0:vrm2 = PseudoVADD_VI_MF4 %0.sub_vrm1_0:vrm2, 3, -1, 4, implicit $vl, implicit $vtype
>   
>   128B    bb.1.loopIR3.i.i:
>           ; predecessors: %bb.0, %bb.1
>             successors: %bb.1(0x80000000); %bb.1(100.00%)
>   
>   160B      %10:vr = VL1RE8_V $x0 :: (load unknown-size from `ptr addrspace(1) null`, align 8, addrspace 1)
>   176B      dead $x0 = PseudoVSETIVLI 4, 192, implicit-def $vl, implicit-def $vtype
>   192B      early-clobber %11:vr = PseudoVRGATHEREI16_VV_M1_M2 %10:vr, %0:vrm2, 4, 3, implicit $vl, implicit $vtype
>   208B      early-clobber %12:vr = PseudoVRGATHEREI16_VV_M1_M2 %10:vr, %1:vrm2, 4, 3, implicit $vl, implicit $vtype
>   224B      dead %17:gpr = PseudoVSETVLIX0 $x0, 192, implicit-def $vl, implicit-def $vtype
>   240B      %13:vr = PseudoVAND_VV_M1 %11:vr, %12:vr, -1, 3, implicit $vl, implicit $vtype
>   256B      dead $x0 = PseudoVSETIVLI 4, 192, implicit-def $vl, implicit-def $vtype
>   272B      early-clobber %14:vr = PseudoVRGATHEREI16_VV_M1_M2 %10:vr, %2:vrm2, 4, 3, implicit $vl, implicit $vtype
>   288B      dead %18:gpr = PseudoVSETVLIX0 $x0, 192, implicit-def $vl, implicit-def $vtype
>   304B      %15:vr = PseudoVAND_VV_M1 %13:vr, %14:vr, -1, 3, implicit $vl, implicit $vtype
>   320B      VS1R_V %15:vr, $x0 :: (store unknown-size into `ptr addrspace(1) null`, align 4, addrspace 1)
>   336B      PseudoBR %bb.1

Here is the generated assembly; see the note inline.

  foo:                                    # @foo
  	.cfi_startproc
  # %bb.0:                                # %loopIR.preheader.i.i
  	vsetvli	a0, zero, e16, mf4, ta, ma
  	vid.v	v8
  	vadd.vi	v10, v8, 1
  	vadd.vi	v12, v8, 3
  .LBB0_1:                                # %loopIR3.i.i
                                          # =>This Inner Loop Header: Depth=1
  	vl1r.v	v14, (zero)
  	vsetivli	zero, 4, e8, m1, ta, ma
  	vrgatherei16.vv	v15, v14, v8  <- The v14 here is LMUL=2 so it's v14 and v15. This means writing v15 violated the early clobber constraint.
  	vrgatherei16.vv	v16, v14, v10
  	vsetvli	a0, zero, e8, m1, ta, ma
  	vand.vv	v15, v15, v16
  	vsetivli	zero, 4, e8, m1, ta, ma
  	vrgatherei16.vv	v16, v14, v12
  	vsetvli	a0, zero, e8, m1, ta, ma
  	vand.vv	v14, v15, v16
  	vs1r.v	v14, (zero)
  	j	.LBB0_1
  .Lfunc_end0:
  	.size	foo, .Lfunc_end0-foo
  	.cfi_endproc
                                          # -- End function
  	.section	".note.GNU-stack","",@progbits

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D129735/new/

https://reviews.llvm.org/D129735