[LLVMdev] Help with hazards

Wed Dec 14 11:44:16 PST 2011

The scoreboard hazard detector that I've added for the PPC 440 is not
detecting hazards as it should (which certainly could be my fault
somehow, but...). For example, it will produce a schedule that looks
like...

SU(28): 0x127969b0: f64,ch = LFD 0x12793aa0, 0x1277b4f0,
0x127965b0<Mem:LD8[%scevgep100](tbaa=!"double")> [ORD=41] [ID=28]
SU(46): 0x12796ab0: f64 = FADD 0x127969b0, 0x127968b0 [ORD=42] [ID=46]
SU(27): 0x12796cb0: ch = STFD 0x12796ab0, 0x12793aa0, 0x1277b3f0,
0x127969b0:1<Mem:ST8[%scevgep103](tbaa=!"double")> [ORD=46] [ID=27]
SU(26): 0x127970b0: f64,ch = LFD 0x127941a0, 0x1277b4f0,
0x12796cb0<Mem:LD8[%scevgep94](align=16)(tbaa=!"double")> [ORD=50]
[ID=26]
SU(47): 0x127972c0: f64 = FADD 0x127970b0, 0x127968b0 [ORD=51] [ID=47]
SU(25): 0x127974c0: ch = STFD 0x127972c0, 0x127941a0, 0x1277b3f0,
0x127970b0:1<Mem:ST8[%scevgep97](align=16)(tbaa=!"double")> [ORD=55]
[ID=25]

In other words, it produces a set of load, add, store triples,
non-interleaved, in order. The problem is that the result of the load is
not immediately available, and neither is the result of the add. The
loads are covered by the itinerary:

  InstrItinData<LdStLFD     , [InstrStage<1, [IFTH1, IFTH2]>,
                               InstrStage<1, [PDCD1, PDCD2]>,
                               InstrStage<1, [DISS1, DISS2]>,
                               InstrStage<1, [LRACC]>,
                               InstrStage<1, [AGEN]>,
                               InstrStage<1, [CRD]>,
                               InstrStage<2, [LWB]>],
                              [9, 5, 5],
                              [NoBypass, GPR_Bypass, GPR_Bypass]>,

the add is covered by the itinerary:

  InstrItinData<FPGeneral   , [InstrStage<1, [IFTH1, IFTH2]>,
                               InstrStage<1, [PDCD1, PDCD2]>,
                               InstrStage<1, [DISS1, DISS2]>,
                               InstrStage<1, [FRACC]>,
                               InstrStage<1, [FEXE1]>,
                               InstrStage<1, [FEXE2]>,
                               InstrStage<1, [FEXE3]>,
                               InstrStage<1, [FEXE4]>,
                               InstrStage<1, [FEXE5]>,
                               InstrStage<1, [FEXE6]>,
                               InstrStage<1, [FWB]>],
                              [10, 4, 4],
                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,

the store is covered by:

  InstrItinData<LdStUX      , [InstrStage<1, [IFTH1, IFTH2]>,
                               InstrStage<1, [PDCD1, PDCD2]>,
                               InstrStage<1, [DISS1, DISS2]>,
                               InstrStage<1, [LRACC]>,
                               InstrStage<1, [AGEN]>,
                               InstrStage<1, [CRD]>,
                               InstrStage<1, [LWB]>],
                              [8, 5, 5],
                              [NoBypass, GPR_Bypass, GPR_Bypass]>,

So, say that the load dispatches in cycle 1. According to the itinerary,
the result of the load is not available until cycle 9. The add
dispatches in the same cycle, or one cycle later. In the best case, it
dispatches one cycle later (in cycle 2). It expects to read its inputs 4
cycles later in cycle number 6. The input, however, will not be
available until cycle 9 yielding a 3 cycle stall. As far as I can tell
by looking at the debug output, no hazard was reported by the scoreboard
detector. Is this a bug or am I doing something wrong?

I've attached a small test case; run it with llc -mcpu=440.

Thanks again,
Hal

-- 
Hal Finkel
Postdoctoral Appointee
Leadership Computing Facility
Argonne National Laboratory
-------------- next part --------------
; ModuleID = 'tsc_s000.c'
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
target triple = "powerpc-unknown-linux-gnu"

; Module-level data.
; In the visible part of this module only the following globals are referenced:
;   @.str  - the "s000 " string passed to @init in @s000
;   @.str1 - the printf format string used at the end of @s000
;   @str   - the header line passed to puts in @main
;   @X, @Y - the destination and source arrays of the loop in @s000
;   @Z, @U, @V, @aa, @bb, @cc - passed (with @X and @Y) to @dummy in @s000
; The remaining globals (@array, @x, @temp, @temp_int, @a, @b, @c, @d, @e, @tt,
; @indx, @xx, @yy) are not referenced by any function shown here.
@.str = private unnamed_addr constant [6 x i8] c"s000 \00", align 1
@Y = common global [16000 x double] zeroinitializer, align 16
@X = common global [16000 x double] zeroinitializer, align 16
@Z = common global [16000 x double] zeroinitializer, align 16
@U = common global [16000 x double] zeroinitializer, align 16
@V = common global [16000 x double] zeroinitializer, align 16
@aa = common global [256 x [256 x double]] zeroinitializer, align 16
@bb = common global [256 x [256 x double]] zeroinitializer, align 16
@cc = common global [256 x [256 x double]] zeroinitializer, align 16
@.str1 = private unnamed_addr constant [14 x i8] c"S000\09 %.2f \09\09\00", align 1
@array = common global [65536 x double] zeroinitializer, align 16
@x = common global [16000 x double] zeroinitializer, align 16
@temp = common global double 0.000000e+00, align 8
@temp_int = common global i32 0, align 4
@a = common global [16000 x double] zeroinitializer, align 16
@b = common global [16000 x double] zeroinitializer, align 16
@c = common global [16000 x double] zeroinitializer, align 16
@d = common global [16000 x double] zeroinitializer, align 16
@e = common global [16000 x double] zeroinitializer, align 16
@tt = common global [256 x [256 x double]] zeroinitializer, align 16
@indx = common global [16000 x i32] zeroinitializer, align 16
@xx = common global double* null, align 4
@yy = common global double* null, align 4
@str = internal unnamed_addr constant [29 x i8] c"Loop \09 Time(Sec) \09 Checksum \00"

; Test kernel: performs 400000 sweeps of X[i] = Y[i] + 1.0 over 16000 doubles,
; bracketed by two clock() calls, then prints the clock difference divided by
; 1e6 and calls @check(1). Always returns 0.
;
; The inner loop body handles 16 consecutive elements per iteration, so it
; contains 16 independent load/fadd/store groups. This is the code whose
; scheduling the surrounding report is about, so the instruction order below
; should be left exactly as is.
define i32 @s000() nounwind {
entry:
  ; Call @init with the "s000 " string, then record the starting clock() value.
  %call = tail call i32 bitcast (i32 (...)* @init to i32 (i8*)*)(i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0)) nounwind
  %call1 = tail call i32 @clock() nounwind
  br label %for.cond2.preheader

for.cond2.preheader:                              ; preds = %for.end, %entry
  ; Outer sweep counter: starts at 0 and is incremented in %for.end.
  %nl.014 = phi i32 [ 0, %entry ], [ %inc8, %for.end ]
  br label %for.body4

for.body4:                                        ; preds = %for.body4, %for.cond2.preheader
  ; %i.013 starts at 0 and advances by 16 each iteration, so it is always a
  ; multiple of 16.
  %i.013 = phi i32 [ 0, %for.cond2.preheader ], [ %inc.15, %for.body4 ]
  ; Group 0: X[i] = Y[i] + 1.0.
  %arrayidx = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %i.013
  %0 = load double* %arrayidx, align 16, !tbaa !0
  %add = fadd double %0, 1.000000e+00
  %arrayidx5 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %i.013
  store double %add, double* %arrayidx5, align 16, !tbaa !0
  ; Groups 1-15: the element index is formed with `or` instead of `add`.
  ; Because the low four bits of %i.013 are zero, `or %i.013, k` equals
  ; %i.013 + k for every k below 16. Groups with even k use align 16 and
  ; groups with odd k use align 8.
  %inc15 = or i32 %i.013, 1
  %arrayidx.1 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc15
  %1 = load double* %arrayidx.1, align 8, !tbaa !0
  %add.1 = fadd double %1, 1.000000e+00
  %arrayidx5.1 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc15
  store double %add.1, double* %arrayidx5.1, align 8, !tbaa !0
  %inc.116 = or i32 %i.013, 2
  %arrayidx.2 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.116
  %2 = load double* %arrayidx.2, align 16, !tbaa !0
  %add.2 = fadd double %2, 1.000000e+00
  %arrayidx5.2 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.116
  store double %add.2, double* %arrayidx5.2, align 16, !tbaa !0
  %inc.217 = or i32 %i.013, 3
  %arrayidx.3 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.217
  %3 = load double* %arrayidx.3, align 8, !tbaa !0
  %add.3 = fadd double %3, 1.000000e+00
  %arrayidx5.3 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.217
  store double %add.3, double* %arrayidx5.3, align 8, !tbaa !0
  %inc.318 = or i32 %i.013, 4
  %arrayidx.4 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.318
  %4 = load double* %arrayidx.4, align 16, !tbaa !0
  %add.4 = fadd double %4, 1.000000e+00
  %arrayidx5.4 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.318
  store double %add.4, double* %arrayidx5.4, align 16, !tbaa !0
  %inc.419 = or i32 %i.013, 5
  %arrayidx.5 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.419
  %5 = load double* %arrayidx.5, align 8, !tbaa !0
  %add.5 = fadd double %5, 1.000000e+00
  %arrayidx5.5 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.419
  store double %add.5, double* %arrayidx5.5, align 8, !tbaa !0
  %inc.520 = or i32 %i.013, 6
  %arrayidx.6 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.520
  %6 = load double* %arrayidx.6, align 16, !tbaa !0
  %add.6 = fadd double %6, 1.000000e+00
  %arrayidx5.6 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.520
  store double %add.6, double* %arrayidx5.6, align 16, !tbaa !0
  %inc.621 = or i32 %i.013, 7
  %arrayidx.7 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.621
  %7 = load double* %arrayidx.7, align 8, !tbaa !0
  %add.7 = fadd double %7, 1.000000e+00
  %arrayidx5.7 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.621
  store double %add.7, double* %arrayidx5.7, align 8, !tbaa !0
  %inc.722 = or i32 %i.013, 8
  %arrayidx.8 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.722
  %8 = load double* %arrayidx.8, align 16, !tbaa !0
  %add.8 = fadd double %8, 1.000000e+00
  %arrayidx5.8 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.722
  store double %add.8, double* %arrayidx5.8, align 16, !tbaa !0
  %inc.823 = or i32 %i.013, 9
  %arrayidx.9 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.823
  %9 = load double* %arrayidx.9, align 8, !tbaa !0
  %add.9 = fadd double %9, 1.000000e+00
  %arrayidx5.9 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.823
  store double %add.9, double* %arrayidx5.9, align 8, !tbaa !0
  %inc.924 = or i32 %i.013, 10
  %arrayidx.10 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.924
  %10 = load double* %arrayidx.10, align 16, !tbaa !0
  %add.10 = fadd double %10, 1.000000e+00
  %arrayidx5.10 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.924
  store double %add.10, double* %arrayidx5.10, align 16, !tbaa !0
  %inc.1025 = or i32 %i.013, 11
  %arrayidx.11 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.1025
  %11 = load double* %arrayidx.11, align 8, !tbaa !0
  %add.11 = fadd double %11, 1.000000e+00
  %arrayidx5.11 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.1025
  store double %add.11, double* %arrayidx5.11, align 8, !tbaa !0
  %inc.1126 = or i32 %i.013, 12
  %arrayidx.12 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.1126
  %12 = load double* %arrayidx.12, align 16, !tbaa !0
  %add.12 = fadd double %12, 1.000000e+00
  %arrayidx5.12 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.1126
  store double %add.12, double* %arrayidx5.12, align 16, !tbaa !0
  %inc.1227 = or i32 %i.013, 13
  %arrayidx.13 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.1227
  %13 = load double* %arrayidx.13, align 8, !tbaa !0
  %add.13 = fadd double %13, 1.000000e+00
  %arrayidx5.13 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.1227
  store double %add.13, double* %arrayidx5.13, align 8, !tbaa !0
  %inc.1328 = or i32 %i.013, 14
  %arrayidx.14 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.1328
  %14 = load double* %arrayidx.14, align 16, !tbaa !0
  %add.14 = fadd double %14, 1.000000e+00
  %arrayidx5.14 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.1328
  store double %add.14, double* %arrayidx5.14, align 16, !tbaa !0
  %inc.1429 = or i32 %i.013, 15
  %arrayidx.15 = getelementptr inbounds [16000 x double]* @Y, i32 0, i32 %inc.1429
  %15 = load double* %arrayidx.15, align 8, !tbaa !0
  %add.15 = fadd double %15, 1.000000e+00
  %arrayidx5.15 = getelementptr inbounds [16000 x double]* @X, i32 0, i32 %inc.1429
  store double %add.15, double* %arrayidx5.15, align 8, !tbaa !0
  ; Step past the 16 elements handled above; leave the loop once the index
  ; reaches 16000.
  %inc.15 = add nsw i32 %i.013, 16
  %exitcond.15 = icmp eq i32 %inc.15, 16000
  br i1 %exitcond.15, label %for.end, label %for.body4

for.end:                                          ; preds = %for.body4
  ; After every sweep, hand the arrays to the external @dummy.
  ; NOTE(review): presumably this keeps the optimizer from discarding the
  ; repeated sweeps; @dummy is only declared here, so that cannot be confirmed.
  %call6 = tail call i32 @dummy(double* getelementptr inbounds ([16000 x double]* @X, i32 0, i32 0), double* getelementptr inbounds ([16000 x double]* @Y, i32 0, i32 0), double* getelementptr inbounds ([16000 x double]* @Z, i32 0, i32 0), double* getelementptr inbounds ([16000 x double]* @U, i32 0, i32 0), double* getelementptr inbounds ([16000 x double]* @V, i32 0, i32 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @aa, i32 0, i32 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @bb, i32 0, i32 0), [256 x double]* getelementptr inbounds ([256 x [256 x double]]* @cc, i32 0, i32 0), double 0.000000e+00) nounwind
  ; Count the completed sweep and stop after 400000 of them.
  %inc8 = add nsw i32 %nl.014, 1
  %exitcond = icmp eq i32 %inc8, 400000
  br i1 %exitcond, label %for.end9, label %for.cond2.preheader

for.end9:                                         ; preds = %for.end
  ; Take the second clock() reading, print (end - start) / 1e6 with the
  ; @.str1 format, then call @check(1).
  %call10 = tail call i32 @clock() nounwind
  %sub = sub nsw i32 %call10, %call1
  %conv = sitofp i32 %sub to double
  %div = fdiv double %conv, 1.000000e+06
  %call11 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([14 x i8]* @.str1, i32 0, i32 0), double %div) nounwind
  %call12 = tail call i32 bitcast (i32 (...)* @check to i32 (i32)*)(i32 1) nounwind
  ret i32 0
}

declare i32 @init(...)

declare i32 @clock() nounwind

declare i32 @dummy(double*, double*, double*, double*, double*, [256 x double]*, [256 x double]*, [256 x double]*, double)

declare i32 @printf(i8* nocapture, ...) nounwind

declare i32 @check(...)

; Program entry point: prints the header line stored in @str via puts, runs
; @s000 once, and returns 0. The results of both calls are unused.
define i32 @main() nounwind {
entry:
  %puts = tail call i32 @puts(i8* getelementptr inbounds ([29 x i8]* @str, i32 0, i32 0))
  %call1 = tail call i32 @s000()
  ret i32 0
}

declare i32 @puts(i8* nocapture) nounwind

!0 = metadata !{metadata !"double", metadata !1}
!1 = metadata !{metadata !"omnipotent char", metadata !2}
!2 = metadata !{metadata !"Simple C/C++ TBAA", null}