[PATCH] D152317: [NVPTX] Allow using v4i32 for memcpy lowering.

Wed Jun 14 23:28:20 PDT 2023

dcaballe added a comment.

Hi Artem,

It looks like this commit is causing a massive compile-time regression (from a few seconds to 1.5 hours) in one of our models in IREE.
I'm attaching a repro that you can pass to `llc`.
Could you please take a look or revert? Thanks!

  ; Module header for the repro. The triple below selects the 64-bit NVPTX
  ; (CUDA) backend, so this file is meant to be fed to `llc` as-is.
  ; ModuleID = 'main_dispatch_394'
  source_filename = "main_dispatch_394"
  target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
  target triple = "nvptx64-nvidia-cuda"

  ; Repro kernel for the reported `llc` compile-time blow-up. The body is
  ; straight-line code (a single basic block, no branches). Per thread it:
  ;   1. loads two <4 x i32> vectors and one scalar i32 through %1,
  ;   2. turns the first vector into four lane indices via a vectorized float
  ;      computation,
  ;   3. performs four scalar float loads through %0 at those indices,
  ;   4. stores one <4 x float> through %2.
  ; Arguments:
  ;   %0 - noalias, readonly, 16-byte aligned; base of the four indexed float loads
  ;   %1 - noalias, readonly, 16-byte aligned; source of all i32 inputs
  ;   %2 - noalias, 16-byte aligned; destination of the final vector store
  ; NOTE(review): this is a reproducer; keep instruction order, value numbering
  ; and constants exactly as they are, otherwise the slowdown may not trigger.
  ; Function Attrs: mustprogress nofree nosync nounwind willreturn memory(read, argmem: readwrite, inaccessiblemem: readwrite)
  define void @main_dispatch_394_generic_16x16x512x512_i32xi32xi32xf32(ptr noalias readonly align 16 %0, ptr noalias readonly align 16 %1, ptr noalias align 16 %2) local_unnamed_addr #0 {
    ; Cast the three arguments to address space 1 (global memory on NVPTX) and
    ; apply constant element offsets to form the base pointers %7..%10.
    %4 = addrspacecast ptr %2 to ptr addrspace(1)
    %5 = addrspacecast ptr %1 to ptr addrspace(1)
    %6 = addrspacecast ptr %0 to ptr addrspace(1)
    %7 = getelementptr float, ptr addrspace(1) %6, i64 399581184
    %8 = getelementptr i32, ptr addrspace(1) %5, i64 8388608
    %9 = getelementptr i32, ptr addrspace(1) %5, i64 41943040
    %10 = getelementptr float, ptr addrspace(1) %4, i64 58720768
    ; Block index (%11/%12) and thread index (%13). The attached !range metadata
    ; bounds ctaid.x to [0, 2147483647) and tid.x to [0, 64).
    %11 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !4
    %12 = zext i32 %11 to i64
    %13 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !5
    ; Index arithmetic:
    ;   %15 = (ctaid.x >> 1) & 511
    ;   %21 = ((ctaid.x << 8) | (tid.x << 2)) - ((ctaid.x >> 1) << 9)
    ;         which simplifies to (ctaid.x & 1) * 256 + tid.x * 4
    ;   %22 = %15 << 9
    ;   %23 = %22 + %21
    %14 = lshr i64 %12, 1
    %15 = and i64 %14, 511
    %16 = shl nuw nsw i64 %12, 8
    %17 = shl nuw nsw i32 %13, 2
    %18 = zext i32 %17 to i64
    %19 = or i64 %16, %18
    %20 = shl nuw nsw i64 %14, 9
    %21 = sub nsw i64 %19, %20
    %22 = shl nuw nsw i64 %15, 9
    %23 = add nsw i64 %22, %21
    ; %25: four consecutive i32 values starting at element %23 of %8.
    %24 = getelementptr i32, ptr addrspace(1) %8, i64 %23
    %25 = load <4 x i32>, ptr addrspace(1) %24, align 16
    ; %28: one scalar i32 at element %15 + 41943040 of %5 (the same constant
    ; offset that defines %9).
    %26 = getelementptr i32, ptr addrspace(1) %5, i64 %15
    %27 = getelementptr i32, ptr addrspace(1) %26, i64 41943040
    %28 = load i32, ptr addrspace(1) %27, align 4
    ; %30: four consecutive i32 values starting at element %21 of %9.
    %29 = getelementptr i32, ptr addrspace(1) %9, i64 %21
    %30 = load <4 x i32>, ptr addrspace(1) %29, align 16
    ; Splat the scalar %28 across all four lanes and compare (signed):
    ; %.not[i] = %28 < %30[i].
    %31 = insertelement <4 x i32> undef, i32 %28, i64 0
    %32 = shufflevector <4 x i32> %31, <4 x i32> undef, <4 x i32> zeroinitializer
    %.not = icmp slt <4 x i32> %32, %30
    ; %34 = 0 - smin(%25, 0) per lane; %35 is its float conversion and
    ; %36 = %35 * 0.0625.
    %33 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %25, <4 x i32> zeroinitializer)
    %34 = sub <4 x i32> zeroinitializer, %33
    %35 = sitofp <4 x i32> %34 to <4 x float>
    %36 = fmul <4 x float> %35, <float 6.250000e-02, float 6.250000e-02, float 6.250000e-02, float 6.250000e-02>
    ; Clamp from below: lanes with %36 <= 0x3810000000000000 (2^-126, the
    ; smallest normal float) take that constant; all other lanes keep %36.
    %.inv = fcmp ole <4 x float> %36, <float 0x3810000000000000, float 0x3810000000000000, float 0x3810000000000000, float 0x3810000000000000>
    %37 = select <4 x i1> %.inv, <4 x float> <float 0x3810000000000000, float 0x3810000000000000, float 0x3810000000000000, float 0x3810000000000000>, <4 x float> %36
    ; Split %37 into mantissa and exponent by bit manipulation:
    ;   %41 = bits of %37 with the exponent field replaced: mask -2139095041
    ;         (0x807FFFFF) keeps sign + fraction, OR 1056964608 (0x3F000000)
    ;         sets the exponent of 0.5, so |%41| lies in [0.5, 1)
    ;   %44 = float(raw bits >> 23) - 126
    %38 = bitcast <4 x float> %37 to <4 x i32>
    %39 = and <4 x i32> %38, <i32 -2139095041, i32 -2139095041, i32 -2139095041, i32 -2139095041>
    %40 = or <4 x i32> %39, <i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608>
    %41 = bitcast <4 x i32> %40 to <4 x float>
    %42 = lshr <4 x i32> %38, <i32 23, i32 23, i32 23, i32 23>
    %43 = sitofp <4 x i32> %42 to <4 x float>
    %44 = fadd <4 x float> %43, <float -1.260000e+02, float -1.260000e+02, float -1.260000e+02, float -1.260000e+02>
    ; Polynomial evaluation on the decomposed value, producing %64.
    ; NOTE(review): the constants 0x3FE6A09E60000000 (~sqrt(1/2)) and
    ; 0x3FE62E4300000000 (~ln 2) together with the FMA chain look like a
    ; single-precision natural-log approximation of %37 - confirm with the
    ; producer of this IR.
    %45 = fcmp olt <4 x float> %41, <float 0x3FE6A09E60000000, float 0x3FE6A09E60000000, float 0x3FE6A09E60000000, float 0x3FE6A09E60000000>
    %46 = select <4 x i1> %45, <4 x float> %41, <4 x float> zeroinitializer
    %47 = fadd <4 x float> %41, <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>
    %48 = select <4 x i1> %45, <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> zeroinitializer
    %49 = fsub <4 x float> %44, %48
    %50 = fadd <4 x float> %47, %46
    %51 = fmul <4 x float> %50, %50
    %52 = fmul <4 x float> %50, %51
    %53 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %50, <4 x float> <float 0x3FB2043760000000, float 0x3FB2043760000000, float 0x3FB2043760000000, float 0x3FB2043760000000>, <4 x float> <float 0xBFBD7A3700000000, float 0xBFBD7A3700000000, float 0xBFBD7A3700000000, float 0xBFBD7A3700000000>)
    %54 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %50, <4 x float> <float 0xBFBFCBA9E0000000, float 0xBFBFCBA9E0000000, float 0xBFBFCBA9E0000000, float 0xBFBFCBA9E0000000>, <4 x float> <float 0x3FC23D37E0000000, float 0x3FC23D37E0000000, float 0x3FC23D37E0000000, float 0x3FC23D37E0000000>)
    %55 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %50, <4 x float> <float 0x3FC999D580000000, float 0x3FC999D580000000, float 0x3FC999D580000000, float 0x3FC999D580000000>, <4 x float> <float 0xBFCFFFFF80000000, float 0xBFCFFFFF80000000, float 0xBFCFFFFF80000000, float 0xBFCFFFFF80000000>)
    %56 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %53, <4 x float> %50, <4 x float> <float 0x3FBDE4A340000000, float 0x3FBDE4A340000000, float 0x3FBDE4A340000000, float 0x3FBDE4A340000000>)
    %57 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %54, <4 x float> %50, <4 x float> <float 0xBFC555CA00000000, float 0xBFC555CA00000000, float 0xBFC555CA00000000, float 0xBFC555CA00000000>)
    %58 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %55, <4 x float> %50, <4 x float> <float 0x3FD5555540000000, float 0x3FD5555540000000, float 0x3FD5555540000000, float 0x3FD5555540000000>)
    %59 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %56, <4 x float> %52, <4 x float> %57)
    %60 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %59, <4 x float> %52, <4 x float> %58)
    %61 = fmul <4 x float> %52, %60
    %62 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %51, <4 x float> <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>, <4 x float> %61)
    %63 = fadd <4 x float> %50, %62
    %64 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %49, <4 x float> <float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000, float 0x3FE62E4300000000>, <4 x float> %63)
    ; Special-case predicates on the unclamped input %36:
    ;   %65: less than zero or NaN (`ult`), %66: equal to zero, %67: equal to +inf.
    %65 = fcmp ult <4 x float> %36, zeroinitializer
    %66 = fcmp oeq <4 x float> %36, zeroinitializer
    %67 = fcmp oeq <4 x float> %36, <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>
    ; %70 = (%64 / 0x4000A2B240000000) * 16 + 16.
    ; NOTE(review): the divisor is approximately ln 8 - confirm intent.
    %68 = fdiv <4 x float> %64, <float 0x4000A2B240000000, float 0x4000A2B240000000, float 0x4000A2B240000000, float 0x4000A2B240000000>
    %69 = fmul <4 x float> %68, <float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01>
    %70 = fadd <4 x float> %69, <float 1.600000e+01, float 1.600000e+01, float 1.600000e+01, float 1.600000e+01>
    ; Patch in the special cases; later selects win over earlier ones:
    ; +inf input -> +inf, negative/NaN input -> NaN, zero input -> -inf.
    %71 = select <4 x i1> %67, <4 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>, <4 x float> %70
    %72 = select <4 x i1> %65, <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x7FF8000000000000>, <4 x float> %71
    %73 = select <4 x i1> %66, <4 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>, <4 x float> %72
    ; Clamp from above: lanes >= 31.0 become 31.0. The compare is ordered
    ; (`oge`), so NaN lanes pass through unchanged.
    %.inv1 = fcmp oge <4 x float> %73, <float 3.100000e+01, float 3.100000e+01, float 3.100000e+01, float 3.100000e+01>
    %74 = select <4 x i1> %.inv1, <4 x float> <float 3.100000e+01, float 3.100000e+01, float 3.100000e+01, float 3.100000e+01>, <4 x float> %73
    ; Lanes whose integer value %34 is below 16 use float(%34) directly instead
    ; of the value computed above.
    %75 = icmp slt <4 x i32> %34, <i32 16, i32 16, i32 16, i32 16>
    %76 = select <4 x i1> %75, <4 x float> %35, <4 x float> %74
    ; Convert back to i32 (%78); lanes that come out negative get 32 added (%81).
    %77 = fadd <4 x float> %76, zeroinitializer
    %78 = fptosi <4 x float> %77 to <4 x i32>
    %79 = add <4 x i32> %78, <i32 32, i32 32, i32 32, i32 32>
    %80 = icmp slt <4 x i32> %78, zeroinitializer
    %81 = select <4 x i1> %80, <4 x i32> %79, <4 x i32> %78
    ; Per-lane element index in i64:
    ;   %89[i] = (sext(%81[i]) << 4) + ((ctaid.x >> 10) - ((ctaid.x >> 14) << 4))
    ; The scalar term (%.scalar) is splatted across the four lanes.
    %82 = lshr i64 %12, 10
    %83 = lshr i64 %12, 14
    %84 = shl nuw nsw i64 %83, 4
    %.scalar = sub nsw i64 %82, %84
    %85 = insertelement <4 x i64> undef, i64 %.scalar, i64 0
    %86 = shufflevector <4 x i64> %85, <4 x i64> poison, <4 x i32> zeroinitializer
    %87 = sext <4 x i32> %81 to <4 x i64>
    %88 = shl nsw <4 x i64> %87, <i64 4, i64 4, i64 4, i64 4>
    %89 = add <4 x i64> %88, %86
    ; Vector GEP: four float pointers relative to %7, one per lane.
    %90 = getelementptr float, ptr addrspace(1) %7, <4 x i64> %89
    ; Four unconditional scalar float loads, one per lane, reassembled into the
    ; vector %Res3.
    ; NOTE(review): this looks like an already-scalarized gather (the module
    ; declares llvm.masked.gather but never calls it) - confirm.
    %Ptr0 = extractelement <4 x ptr addrspace(1)> %90, i64 0
    %Load0 = load float, ptr addrspace(1) %Ptr0, align 4
    %Res0 = insertelement <4 x float> poison, float %Load0, i64 0
    %Ptr1 = extractelement <4 x ptr addrspace(1)> %90, i64 1
    %Load1 = load float, ptr addrspace(1) %Ptr1, align 4
    %Res1 = insertelement <4 x float> %Res0, float %Load1, i64 1
    %Ptr2 = extractelement <4 x ptr addrspace(1)> %90, i64 2
    %Load2 = load float, ptr addrspace(1) %Ptr2, align 4
    %Res2 = insertelement <4 x float> %Res1, float %Load2, i64 2
    %Ptr3 = extractelement <4 x ptr addrspace(1)> %90, i64 3
    %Load3 = load float, ptr addrspace(1) %Ptr3, align 4
    %Res3 = insertelement <4 x float> %Res2, float %Load3, i64 3
    ; Combine: %91 is 0xC7EFFFFFE0000000 (-FLT_MAX) where %.not is set, else 0.0;
    ; %93 keeps the loaded value only where %81 < 32 (unsigned), else NaN;
    ; %94 is their sum.
    %91 = select <4 x i1> %.not, <4 x float> <float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000, float 0xC7EFFFFFE0000000>, <4 x float> zeroinitializer
    %92 = icmp ult <4 x i32> %81, <i32 32, i32 32, i32 32, i32 32>
    %93 = select <4 x i1> %92, <4 x float> %Res3, <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x7FF8000000000000>
    %94 = fadd <4 x float> %91, %93
    ; Output element offset:
    ;   %100 = (((ctaid.x >> 14) << 22) | (((ctaid.x >> 10) << 18) & 3932160) | %22) + %21
    ; then a 16-byte aligned <4 x float> store relative to %10.
    %95 = shl nuw nsw i64 %83, 22
    %96 = shl nuw nsw i64 %82, 18
    %97 = and i64 %96, 3932160
    %98 = or i64 %97, %95
    %99 = or i64 %98, %22
    %100 = add nsw i64 %99, %21
    %101 = getelementptr float, ptr addrspace(1) %10, i64 %100
    store <4 x float> %94, ptr addrspace(1) %101, align 16
    ret void
  }

  ; Intrinsic declarations. The kernel in this module calls only ctaid.x, tid.x,
  ; smin.v4i32 and fma.v4f32; llvm.assume and the two llvm.masked.gather
  ; variants are declared but have no call site in the text shown here.

  ; Declared but not called by the kernel in this module.
  ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite)
  declare void @llvm.assume(i1 noundef) #1

  ; Reads the PTX special register ctaid.x (block index, x dimension).
  ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
  declare noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #2

  ; Reads the PTX special register tid.x (thread index, x dimension).
  ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
  declare noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2

  ; Lane-wise signed minimum of two <4 x i32> vectors.
  ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
  declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) #2

  ; Lane-wise fused multiply-add on <4 x float>.
  ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
  declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #2

  ; Gather through generic-address-space pointers; declared but not called by
  ; the kernel in this module.
  ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(read)
  declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32 immarg, <4 x i1>, <4 x float>) #3

  ; Gather through addrspace(1) pointers; declared but not called by the kernel
  ; in this module.
  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
  declare <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)>, i32 immarg, <4 x i1>, <4 x float>) #4

  ; Attribute groups referenced by the definitions and declarations above:
  ;   #0 - the kernel definition
  ;   #1 - llvm.assume
  ;   #2 - ctaid.x, tid.x, llvm.smin.v4i32, llvm.fma.v4f32
  ;   #3 - llvm.masked.gather.v4f32.v4p0
  ;   #4 - llvm.masked.gather.v4f32.v4p1 (same as #3 without mustprogress)
  attributes #0 = { mustprogress nofree nosync nounwind willreturn memory(read, argmem: readwrite, inaccessiblemem: readwrite) }
  attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
  attributes #2 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
  attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(read) }
  attributes #4 = { nocallback nofree nosync nounwind willreturn memory(read) }

  ; NVVM annotations for the single function in this module: !0 marks it as a
  ; kernel, and !1..!3 set maxntidx/maxntidy/maxntidz to 64/1/1.
  !nvvm.annotations = !{!0, !1, !2, !3}

  !0 = !{ptr @main_dispatch_394_generic_16x16x512x512_i32xi32xi32xf32, !"kernel", i32 1}
  !1 = !{ptr @main_dispatch_394_generic_16x16x512x512_i32xi32xi32xf32, !"maxntidx", i32 64}
  !2 = !{ptr @main_dispatch_394_generic_16x16x512x512_i32xi32xi32xf32, !"maxntidy", i32 1}
  !3 = !{ptr @main_dispatch_394_generic_16x16x512x512_i32xi32xi32xf32, !"maxntidz", i32 1}
  ; !range operands used on the special-register reads (half-open [lo, hi)):
  ; !4 is attached to the ctaid.x call, !5 to the tid.x call. The upper bound
  ; of !5 (64) matches the maxntidx annotation above.
  !4 = !{i32 0, i32 2147483647}
  !5 = !{i32 0, i32 64}

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D152317/new/

https://reviews.llvm.org/D152317