Table of Contents

Check if the cpu_factor is a good normalization of cpu time and wall time

Load Data

Get dashboard data

Define data

# Analysis window, given as yyyymmdd strings and parsed into dates.
fromDateString <- "20140801"
toDateString <- "20140930"
from <- ymd(fromDateString)
to <- ymd(toDateString)

# Folder holding the cached dashboard data.
cacheDataFolder <- "/afs/cern.ch/project/log_cor/dashb/cached_data"
options(mc.cores = 6)

# Dashboard data source for the window above, with all cache options enabled.
dsDashboard <- DashboardDataSource$new(
    from, to, experiment,
    cacheDataFolder = cacheDataFolder,
    useOldCache = TRUE,
    createDailyCache = TRUE,
    createAggregatedCache = TRUE
)
## DashboardDataSource loading data
## [1] "Loading data from aggregated cache"
## [1] "time to load all:"
##    user  system elapsed 
##   2.691   0.162   2.856
## loaded: 599944.33 entries/s
# Work on the entries table of the data source and report its row count.
dash <- dsDashboard$entries
prettyNum(nrow(dash), big.mark = ",")
## [1] "1,713,441"

Clean

Clean data by removing results that are not interesting

# Keep only rows with positive event count, cpu time and wall time.
dash <- dash[events > 0 & cpu > 0 & wall > 0]
prettyNum(nrow(dash), big.mark = ",")
## [1] "599,038"
# For atlas, give rows with an empty inputfiletype an explicit label.
if (experiment == "atlas") {
    dash[inputfiletype == ""]$inputfiletype <- "NO_ENTRY"
}

# Tidy up factor columns (blank values kept, unused levels dropped).
dash <- clean.factors(
    dash,
    clear.blank = FALSE,
    drop.levels = TRUE,
    verbose = FALSE
)

Join with lshost

# Attach the lshost information to the dashboard rows and report the row count.
joined <- addLshost2Dashboard(dash)
prettyNum(nrow(joined), big.mark = ",")
## [1] "400,349"

Pre-calculate rates

# Wall time not accounted for by cpu time; treated as io in this analysis.
joined$io <- joined$wall - joined$cpu

# cpu time scaled by cpuf, and the scaled value per event.
joined$cpu_norm <- joined$cpu * joined$cpuf
joined$cpu_norm_norm <- joined$cpu_norm / joined$events

# Wall time variants:
#   wall_norm       - only the cpu part is scaled by cpuf, io is left as is
#   wall_naive_norm - the whole wall time is scaled by cpuf
#   wall_norm_norm  - scaled cpu part per event, plus unscaled io
joined$wall_norm <- joined$io + joined$cpu_norm
joined$wall_naive_norm <- joined$wall * joined$cpuf
joined$wall_norm_norm <- joined$io + joined$cpu_norm / joined$events

# Share of wall time spent on cpu, raw and normalized.
joined$cpu_rate <- joined$cpu / joined$wall
joined$cpu_rate_norm <- joined$cpu_norm / joined$wall_norm

Prepare nice concise dataset

Get ranking of large groups

# Select a test group for the configured experiment: rank the large groups,
# plot their cpu densities to spot a candidate, then narrow `data` down to a
# single group. Every plot is wrapped in print() because expressions inside a
# braced block are not auto-printed (only the value of the block's last
# expression is), so unwrapped intermediate plots would silently be dropped.
# NOTE(review): `title` is built before `data` is narrowed to one group, so the
# item count and date range in it describe all ranked groups - confirm intended.
if (experiment == "atlas") {
    ranking <- get_group_rankings(joined, "inputfileproject", 5000)
    data <- joined[inputfileproject %in% (ranking$inputfileproject)]

    title <- paste0(experiment, " - ", min(data$startDashboard), " to ", max(data$end.time),
                    "    ", prettyNum(nrow(data), big.mark = ","), " items")

    #' check for a good test group
    print(ggplot(data) + geom_density(aes(x = cpu, colour = inputfileproject)) + labs(title = title))

    #' Select group mc12_8TeV (biggest group with a concise peak)
    data <- joined[inputfileproject == "mc12_8TeV"]

    #' Display the influence of the cpufactor on the cpu distribution
    print(ggplot(data) + geom_density(aes(x = cpu, colour = as.factor(cpuf))) + geom_density(aes(x = cpu)))

} else if (experiment == "cms") {

    ranking <- get_group_rankings(joined, "taskname", 5000)
    data <- joined[taskname %in% (ranking$taskname)]

    title <- paste0(experiment, " - ", min(data$startDashboard), " to ", max(data$finished),
                    "    ", prettyNum(nrow(data), big.mark = ","), " items")

    #' check for a good test group
    print(ggplot(data) + geom_density(aes(x = cpu, colour = taskname)) + labs(title = title))

    #' Select group wmagent_amaltaro_FSQ-ppSpring2014-00007_ForceCompletion_140828_201751_9624 (big group with a concise peak)
    data <- joined[taskname == "wmagent_amaltaro_FSQ-ppSpring2014-00007_ForceCompletion_140828_201751_9624"]

    #' Display the influence of io on these jobs
    print(ggplot(data) + geom_density(aes(x = cpu_rate, colour = taskname)) + labs(title = title))
    #' * There is a reasonable io part of about 30% on average
    #' * This means we can check the effect of wall normalization, compared to cpu normalization

} else {
    # Without a selected group `data` is never assigned and every later step
    # would fail with an unrelated error, so fail here with a clear message.
    stop("Unsupported experiment: ", experiment, call. = FALSE)
}

plot of chunk unnamed-chunk-5 plot of chunk unnamed-chunk-5

aggregate by model and inspect deviation between models

# Per cpu model: mean, standard deviation and 95% confidence bounds of each raw
# and normalized time. Elements [3] and [1] of the CI() result are used as the
# lower and upper bound respectively.
data_by_model <- data[, list(
    cpu.mean = mean(cpu),
    cpu.sd = sd(cpu),
    cpu.ci_low = CI(cpu, ci = .95)[3],
    cpu.ci_high = CI(cpu, ci = .95)[1],

    cpu_norm.mean = mean(cpu_norm),
    cpu_norm.sd = sd(cpu_norm),
    cpu_norm.ci_low = CI(cpu_norm, ci = .95)[3],
    cpu_norm.ci_high = CI(cpu_norm, ci = .95)[1],

    cpu_norm_norm.mean = mean(cpu_norm_norm),
    cpu_norm_norm.sd = sd(cpu_norm_norm),
    cpu_norm_norm.ci_low = CI(cpu_norm_norm, ci = .95)[3],
    cpu_norm_norm.ci_high = CI(cpu_norm_norm, ci = .95)[1],

    wall.mean = mean(wall),
    wall.sd = sd(wall),
    wall.ci_low = CI(wall, ci = .95)[3],
    wall.ci_high = CI(wall, ci = .95)[1],

    wall_norm.mean = mean(wall_norm),
    wall_norm.sd = sd(wall_norm),
    wall_norm.ci_low = CI(wall_norm, ci = .95)[3],
    wall_norm.ci_high = CI(wall_norm, ci = .95)[1],

    wall_naive_norm.mean = mean(wall_naive_norm),
    # Spread of the naive normalization itself (previously taken from wall_norm).
    wall_naive_norm.sd = sd(wall_naive_norm),
    wall_naive_norm.ci_low = CI(wall_naive_norm, ci = .95)[3],
    wall_naive_norm.ci_high = CI(wall_naive_norm, ci = .95)[1],

    # cpuf of the first row in each group; presumably constant within a model.
    cpuf = cpuf[1]

), by = list(model)]

Check cpu normalization with controlled cpu-bound benchmark

# Restore the saved benchmark workspace. The code below uses bench,
# bench.model_direct and bench.model, which are presumably provided by this file.
load("~/afs/R/R_Repo/analysis/normalization/lsf-bench.RData")

Compare normalized and “raw” peaks for cpu

Display the influence of the cpufactor on the cpu distribution

# Benchmark time density per cpuf value, overlaid with the overall density.
ggplot(bench) +
    geom_density(aes(x = b.m, colour = as.factor(cpuf))) +
    geom_density(aes(x = b.m))

plot of chunk unnamed-chunk-8

Compare “raw” cpu time and normalized cpu time (cpu*cpuf)

# Raw benchmark time per cpuf value, outliers hidden.
boxplot(b.m ~ cpuf, data = bench, outline = FALSE)

plot of chunk unnamed-chunk-9

# Normalized benchmark time per cpuf value, outliers hidden.
boxplot(cpu_norm ~ cpuf, data = bench, outline = FALSE)

plot of chunk unnamed-chunk-9

Compare mean and confidence interval between models

# Mean raw cpu time per model with its confidence bounds.
ggplot(bench.model_direct, aes(x = cpuf, y = cpu.mean, fill = model)) +
    geom_bar(position = position_dodge(), stat = "identity") +
    geom_errorbar(aes(ymin = cpu.ci_low, ymax = cpu.ci_high), width = .1)

plot of chunk unnamed-chunk-10

# Mean normalized cpu time per model with its confidence bounds.
ggplot(bench.model_direct, aes(x = cpuf, y = cpu_norm.mean, fill = model)) +
    geom_bar(position = position_dodge(), stat = "identity") +
    geom_errorbar(aes(ymin = cpu_norm.ci_low, ymax = cpu_norm.ci_high), width = .1)

plot of chunk unnamed-chunk-10

# Same bar chart built from bench.model and its cpu_norm.mean_mean column.
ggplot(bench.model, aes(x = cpuf, y = cpu_norm.mean_mean, fill = model)) +
    geom_bar(position = position_dodge(), stat = "identity") +
    geom_errorbar(aes(ymin = cpu_norm.ci_low, ymax = cpu_norm.ci_high), width = .1)

plot of chunk unnamed-chunk-10

Check correlation between cpuf and cpu times

with(bench.model_direct, cor(cpu.mean, cpuf))
## [1] -0.8947847
with(bench.model_direct, cor(cpu_norm.mean, cpuf))
## [1] -0.2084638

Coefficient of variation of raw cpu values

# Relative spread (sd / mean) of the per-model raw cpu means.
cv_raw <- sd(bench.model_direct$cpu.mean) / mean(bench.model_direct$cpu.mean)
print(cv_raw)
## [1] 0.3090142

Coefficient of variation of the normalized values is much better

# Relative spread (sd / mean) of the per-model normalized cpu means.
cv_norm <- sd(bench.model_direct$cpu_norm.mean) / mean(bench.model_direct$cpu_norm.mean)
print(cv_norm)
## [1] 0.1449827

Analyse Normalization of cpu time from log data

Compare normalized and “raw” peaks for cpu

Display the influence of the cpufactor on the cpu distribution

# cpu time density per cpuf value, overlaid with the overall density.
ggplot(data) +
    geom_density(aes(x = cpu, colour = as.factor(cpuf))) +
    geom_density(aes(x = cpu))

plot of chunk unnamed-chunk-14

Compare “raw” cpu time and normalized cpu time (cpu*cpuf)

# Raw cpu time per cpuf value, coloured by model.
ggplot(data = data) +
    geom_boxplot(aes(x = as.factor(cpuf), y = cpu, colour = model)) +
    labs(title = title)

plot of chunk unnamed-chunk-15

# Normalized cpu time per cpuf value, coloured by model.
ggplot(data = data) +
    geom_boxplot(aes(x = as.factor(cpuf), y = cpu_norm, colour = model)) +
    labs(title = title)

plot of chunk unnamed-chunk-15

Compare mean and confidence interval between models

# Mean raw cpu time per model with its confidence bounds.
ggplot(data_by_model, aes(x = cpuf, y = cpu.mean, fill = model)) +
    geom_bar(position = position_dodge(), stat = "identity") +
    geom_errorbar(aes(ymin = cpu.ci_low, ymax = cpu.ci_high), width = .1)

plot of chunk unnamed-chunk-16

# Mean normalized cpu time per model with its confidence bounds.
ggplot(data_by_model, aes(x = cpuf, y = cpu_norm.mean, fill = model)) +
    geom_bar(position = position_dodge(), stat = "identity") +
    geom_errorbar(aes(ymin = cpu_norm.ci_low, ymax = cpu_norm.ci_high), width = .1)

plot of chunk unnamed-chunk-16

Check correlation between cpuf and cpu times

with(data, cor(cpuf, cpu))
## [1] -0.6164287
with(data, cor(cpuf, cpu_norm))
## [1] 0.07971474

Coefficient of variation of raw cpu values

# Relative spread (sd / mean) of the per-model raw cpu means.
cv_raw <- sd(data_by_model$cpu.mean) / mean(data_by_model$cpu.mean)
print(cv_raw)
## [1] 0.2600274

Coefficient of variation of the normalized values gets much better

# Relative spread (sd / mean) of the per-model normalized cpu means.
cv_norm <- sd(data_by_model$cpu_norm.mean) / mean(data_by_model$cpu_norm.mean)
print(cv_norm)
## [1] 0.1068927

Result:

Analyse normalization of cpu time by events

Display the influence of the events on the normalized cpu distribution

# Density of normalized cpu time per event count, overlaid with the overall
# cpu_norm density. The overlay must use the same variable as the per-group
# curves (it previously plotted wall on the cpu_norm axis).
ggplot(data) +
    geom_density(aes(x = cpu_norm, colour = as.factor(events))) +
    geom_density(aes(x = cpu_norm))

plot of chunk unnamed-chunk-20

Compare “raw” normalized cpu time with additional normalization by events

# Normalized cpu time per event count.
ggplot(data = data) +
    geom_boxplot(aes(x = as.factor(events), y = cpu_norm)) +
    labs(title = title)

plot of chunk unnamed-chunk-21

# Normalized cpu time per event, grouped by event count.
ggplot(data = data) +
    geom_boxplot(aes(x = as.factor(events), y = cpu_norm_norm)) +
    labs(title = title)

plot of chunk unnamed-chunk-21

Check how many entries we have per events:

# Number of rows for each distinct event count.
xtabs(~events, data = data)
## events
##  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28 
##   2   8  13  21  42  61  97 161 201 280 370 467 492 554 580 572 521 536 
##  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  46  48 
## 489 413 356 299 253 204 140 118  78  59  29  25  12   8   4   4   4   1 
##  50 
##   1

Study effect of additional event normalization on distribution between cpu types

# Mean normalized cpu time per model, before the per-event normalization.
ggplot(data_by_model, aes(x = cpuf, y = cpu_norm.mean, fill = model)) +
    geom_bar(position = position_dodge(), stat = "identity") +
    geom_errorbar(aes(ymin = cpu_norm.ci_low, ymax = cpu_norm.ci_high), width = .1)

plot of chunk unnamed-chunk-23

# Mean normalized cpu time per event and model.
ggplot(data_by_model, aes(x = cpuf, y = cpu_norm_norm.mean, fill = model)) +
    geom_bar(position = position_dodge(), stat = "identity") +
    geom_errorbar(aes(ymin = cpu_norm_norm.ci_low, ymax = cpu_norm_norm.ci_high), width = .1)

plot of chunk unnamed-chunk-23

Check if we get a nicer looking peak

# Mean-scaled densities of cpu_norm with and without division by events.
ggplot(data) +
    geom_density(aes(x = cpu_norm / mean(cpu_norm), colour = "cpu_norm")) +
    geom_density(aes(x = (cpu_norm / events) / mean(cpu_norm / events), colour = "cpu_norm/events")) +
    labs(title = title)

plot of chunk unnamed-chunk-24

Check correlation between cpu and events

with(data, cor(events, cpu_norm))
## [1] 0.7526377
with(data, cor(events, cpu_norm / events))
## [1] -0.1051302

Check how the confidence interval changes (2 sigma as a fraction of the mean value)

sd_norm(data$cpu_norm) * 2
## [1] 0.4816126
sd_norm(data$cpu_norm / data$events) * 2
## [1] 0.3135891

Analyse normalization of wall time

Display the influence of the cpufactor on the wall distribution

# Wall time density per cpuf value, overlaid with the overall density.
ggplot(data) +
    geom_density(aes(x = wall, colour = as.factor(cpuf))) +
    geom_density(aes(x = wall))

plot of chunk unnamed-chunk-27

Compare “raw” wall time, naive normalized wall time (wall*cpuf) and advanced, partial wall normalization (cpu*cpuf + io)

# Raw wall time per cpuf value.
ggplot(data = data) +
    geom_boxplot(aes(x = as.factor(cpuf), y = wall, colour = model)) +
    ylim(2000, 12000) +
    labs(title = title)

plot of chunk unnamed-chunk-28

# Naively normalized wall time per cpuf value.
ggplot(data = data) +
    geom_boxplot(aes(x = as.factor(cpuf), y = wall_naive_norm, colour = model)) +
    ylim(2000, 12000) +
    labs(title = title)

plot of chunk unnamed-chunk-28

# Partially normalized wall time per cpuf value.
ggplot(data = data) +
    geom_boxplot(aes(x = as.factor(cpuf), y = wall_norm, colour = model)) +
    ylim(2000, 12000) +
    labs(title = title)

plot of chunk unnamed-chunk-28

Compare mean and confidence interval between models

# Mean raw wall time per model with its confidence bounds.
ggplot(data_by_model, aes(x = cpuf, y = wall.mean, fill = model)) +
    geom_bar(position = position_dodge(), stat = "identity") +
    geom_errorbar(aes(ymin = wall.ci_low, ymax = wall.ci_high), width = .1)

plot of chunk unnamed-chunk-29

# Mean naively normalized wall time per model with its confidence bounds.
ggplot(data_by_model, aes(x = cpuf, y = wall_naive_norm.mean, fill = model)) +
    geom_bar(position = position_dodge(), stat = "identity") +
    geom_errorbar(aes(ymin = wall_naive_norm.ci_low, ymax = wall_naive_norm.ci_high), width = .1)

plot of chunk unnamed-chunk-29

# Mean partially normalized wall time per model with its confidence bounds.
ggplot(data_by_model, aes(x = cpuf, y = wall_norm.mean, fill = model)) +
    geom_bar(position = position_dodge(), stat = "identity") +
    geom_errorbar(aes(ymin = wall_norm.ci_low, ymax = wall_norm.ci_high), width = .1)

plot of chunk unnamed-chunk-29

Check correlation between cpuf and wall times

with(data, cor(wall, cpuf))
## [1] -0.5754738
with(data, cor(wall_naive_norm, cpuf))
## [1] 0.5217003
with(data, cor(wall_norm, cpuf))
## [1] 0.03227462

Coefficient of variation of raw wall values

# Relative spread of the raw wall time.
print(sd_norm(data$wall))
## [1] 0.247516

Coefficient of variation of naive normalization gets better

# Relative spread of the naively normalized wall time.
print(sd_norm(data$wall_naive_norm))
## [1] 0.212515

Coefficient of variation of advanced normalization (cpu*cpuf + io) gets even better

# Relative spread of the partially normalized wall time.
print(sd_norm(data$wall_norm))
## [1] 0.1984268

Analyse normalization of wall time by events

Display the influence of the events on the wall distribution

# Wall time density per event count, overlaid with the overall density.
ggplot(data) +
    geom_density(aes(x = wall, colour = as.factor(events))) +
    geom_density(aes(x = wall))

plot of chunk unnamed-chunk-34

Compare “raw” normalized wall time with additional normalization by events

# Partially normalized wall time per event count.
ggplot(data = data) +
    geom_boxplot(aes(x = as.factor(events), y = wall_norm)) +
    ylim(0, 15000) +
    labs(title = title)

plot of chunk unnamed-chunk-35

# Partially normalized wall time divided by events, per event count.
ggplot(data = data) +
    geom_boxplot(aes(x = as.factor(events), y = wall_norm / events)) +
    ylim(100, 500) +
    labs(title = title)

plot of chunk unnamed-chunk-35

check cpu and io as well

# Normalized cpu time divided by events, per event count.
ggplot(data = data) +
    geom_boxplot(aes(x = as.factor(events), y = cpu_norm / events))

plot of chunk unnamed-chunk-36

# io divided by events, per event count.
ggplot(data = data) +
    geom_boxplot(aes(x = as.factor(events), y = io / events)) +
    ylim(0, 200)

plot of chunk unnamed-chunk-36

Check again partial normalization of wall time by events

# Per-event cpu part plus unscaled io, per event count.
ggplot(data = data) +
    geom_boxplot(aes(x = as.factor(events), y = cpu_norm / events + io)) +
    ylim(1000, 2000)

plot of chunk unnamed-chunk-37

# Doubly normalized wall time: per-event normalized cpu part plus unscaled io.
data$wall_norm_norm <- data$cpu_norm / data$events + data$io

Check if we get a nice looking peak

# Mean-scaled densities of the three wall time variants.
ggplot(data) +
    geom_density(aes(x = wall / mean(wall), colour = "wall")) +
    geom_density(aes(x = wall_norm / mean(wall_norm), colour = "wall_norm")) +
    geom_density(aes(x = wall_norm_norm / mean(wall_norm_norm), colour = "wall_norm_norm")) +
    labs(title = title)

plot of chunk unnamed-chunk-39

Check correlation between events and wall times

with(data, cor(wall, events))
## [1] 0.4616069
with(data, cor(wall_norm, events))
## [1] 0.6989364
with(data, cor(wall_norm_norm, events))
## [1] -0.003498298

Check coefficients of variation (reported as 2 sigma relative to the mean)

print(sd_norm(data$wall) * 2)
## [1] 0.495032
print(sd_norm(data$wall_norm) * 2)
## [1] 0.3968536
print(sd_norm(data$wall_norm_norm) * 2)
## [1] 0.5317404

Check distribution of double normalized wall

# Density of the doubly normalized wall time.
ggplot(data) +
    geom_density(aes(x = wall_norm_norm, colour = "wall_norm_norm")) +
    labs(title = title)

plot of chunk unnamed-chunk-44

Try to see what is different about those outliers:

# Compare the rows below the 2000 threshold with the rows above it.
describe_differences(
    data[wall_norm_norm < 2000],
    data[wall_norm_norm > 2000]
)
##               names       diff_abs diff_sigma
##  1:              io        1242.74      13.71
##  2:  wall_norm_norm        1248.24      12.90
##  3:            wall        1752.66       2.33
##  4: wall_naive_norm        2311.33       1.73
##  5:       wall_norm        1661.44       1.60
##  6:             cpu         441.95       0.61
##  7:           ncpus           2.43       0.47
##  8:        cpu_norm         303.96       0.30
##  9:   cpu_norm_norm           6.62       0.26
## 10:          events           0.68       0.13
## 11:          maxmem -1645763353.32      -0.14
## 12:            cpuf          -0.28      -0.44
## 13:          maxswp -5065220943.78      -0.48
## 14:        cpu_rate          -0.11      -1.49
## 15:   cpu_rate_norm          -0.12      -2.73

Check io relative to normalized wall time

# io against doubly normalized wall time, coloured by event count.
ggplot(data) +
    geom_point(aes(x = wall_norm_norm, y = io, colour = as.factor(events))) +
    labs(title = title)

plot of chunk unnamed-chunk-46

Re-check coefficient of variation for non-outliers

# Mean-scaled densities of the wall time variants for rows below the threshold.
ggplot(data[wall_norm_norm < 2000]) +
    geom_density(aes(x = wall / mean(wall), colour = "wall")) +
    geom_density(aes(x = wall_norm / mean(wall_norm), colour = "wall_norm")) +
    geom_density(aes(x = wall_norm_norm / mean(wall_norm_norm), colour = "wall_norm_norm")) +
    labs(title = title)

plot of chunk unnamed-chunk-47

print(sd_norm(data[wall_norm_norm < 2000]$wall) * 2)
## [1] 0.4384051
print(sd_norm(data[wall_norm_norm < 2000]$wall_norm) * 2)
## [1] 0.3731874
print(sd_norm(data[wall_norm_norm < 2000]$wall_norm_norm) * 2)
## [1] 0.1343652

Check influence of each normalization factor for cpu

#' Full normalization: cpu time scaled by cpuf and divided by events

print( sd_norm(data$cpu_norm_norm) )
## [1] 0.1567945

only events

# cpu time divided by events only (no cpuf scaling).
print(sd_norm(data$cpu / data$events))
## [1] 0.2771055

only cpuf

# cpu time scaled by cpuf only (not divided by events).
print(sd_norm(data$cpu_norm))
## [1] 0.2408063

no normalization

# Raw cpu time.
print(sd_norm(data$cpu))
## [1] 0.3369141

Show in plot

# Mean-scaled densities of cpu time under each normalization.
ggplot(data) +
    geom_density(aes(x = cpu / mean(cpu), colour = "cpu")) +
    geom_density(aes(x = cpu_norm / mean(cpu_norm), colour = "cpu_norm")) +
    geom_density(aes(x = (cpu / events) / mean(cpu / events), colour = "cpu per event")) +
    geom_density(aes(x = cpu_norm_norm / mean(cpu_norm_norm), colour = "cpu_norm_norm")) +
    labs(title = title)

plot of chunk unnamed-chunk-52

Check influence of each normalization factor for wall

Take the dataset without (most of) the outliers

# Rows whose doubly normalized wall time lies below the 2000 threshold.
clean <- data[wall_norm_norm < 2000]

no normalization

# Raw wall time.
print(sd_norm(clean$wall))
## [1] 0.2192025

only events

# Raw cpu time per event plus io (no cpuf scaling).
print(sd_norm(clean$cpu / clean$events + clean$io))
## [1] 0.07369879

only cpuf

# cpu part scaled by cpuf plus io (not divided by events).
print(sd_norm(clean$wall_norm))
## [1] 0.1865937

check for full normalization

print(sd_norm(clean$wall_norm_norm))
## [1] 0.06718259
ggplot(clean) +
    geom_density(aes(x = wall / mean(wall), colour = "wall")) +
    geom_density(aes(x = wall_norm / mean(wall_norm), colour = "wall_norm")) +
    geom_density(aes(x = (cpu / events + io) / mean(cpu / events + io), colour = "wall event norm")) +
    geom_density(aes(x = wall_norm_norm / mean(wall_norm_norm), colour = "wall_norm_norm")) +
    labs(title = title)

plot of chunk unnamed-chunk-57

Check correlation between normalized wall time and cpu rate

# Mean-scaled raw wall time against cpu_rate, with the cpu_rate density.
ggplot(data) +
    geom_point(aes(y = wall / mean(wall), x = cpu_rate), alpha = 0.2) +
    geom_density(aes(x = cpu_rate, colour = "density")) +
    labs(title = title)

plot of chunk unnamed-chunk-58

# Mean-scaled wall_norm against cpu_rate_norm, with the cpu_rate_norm density.
ggplot(data) +
    geom_point(aes(y = wall_norm / mean(wall_norm), x = cpu_rate_norm), alpha = 0.2) +
    geom_density(aes(x = cpu_rate_norm, colour = "density")) +
    labs(title = title)

plot of chunk unnamed-chunk-58

# Mean-scaled wall_norm_norm against cpu_rate_norm, with the cpu_rate_norm density.
ggplot(data) +
    geom_point(aes(y = wall_norm_norm / mean(wall_norm_norm), x = cpu_rate_norm), alpha = 0.2) +
    geom_density(aes(x = cpu_rate_norm, colour = "density")) +
    labs(title = title)

plot of chunk unnamed-chunk-58

Check Pearson correlations

with(data, cor(cpu_rate, wall))
## [1] 0.6304693
with(data, cor(cpu_rate, wall_norm_norm))
## [1] -0.2336287
with(data, cor(cpu_rate_norm, wall_norm_norm))
## [1] -0.4633676

Check variance/sd of cpu_rate

sd_norm(data$cpu_rate) * 2
## [1] 0.2587699
sd_norm(data$cpu_rate_norm) * 2
## [1] 0.1379436

Conclusions