Define data
# Analysis window, given as yyyymmdd strings and parsed with ymd().
fromDateString <- "20140801"
toDateString <- "20140930"
from <- ymd(fromDateString)
to <- ymd(toDateString)

# Folder passed to the data source as its cache location.
cacheDataFolder <- "/afs/cern.ch/project/log_cor/dashb/cached_data"

# Set the "mc.cores" option to 6.
options("mc.cores" = 6)

# Build the dashboard data source for the window and experiment, with the
# old-cache, daily-cache and aggregated-cache switches all set to TRUE.
dsDashboard <- DashboardDataSource$new(
  from, to, experiment,
  cacheDataFolder = cacheDataFolder,
  useOldCache = TRUE,
  createDailyCache = TRUE,
  createAggregatedCache = TRUE
)
## DashboardDataSource loading data
## [1] "Loading data from aggregated cache"
## [1] "time to load all:"
## user system elapsed
## 2.691 0.162 2.856
## loaded: 599944.33 entries/s
# Take the entries of the data source and show the row count.
dash <- dsDashboard$entries
prettyNum(nrow(dash), big.mark = ",")
## [1] "1,713,441"
Clean data by removing results that are not interesting
# Keep only rows with positive events, cpu and wall values, then show the
# remaining row count.
dash <- dash[events > 0 & cpu > 0 & wall > 0]
prettyNum(nrow(dash), big.mark = ",")
## [1] "599,038"
# For atlas, replace empty inputfiletype values with "NO_ENTRY".
if (experiment == "atlas") {
  dash[inputfiletype == ""]$inputfiletype <- "NO_ENTRY"
}

# Run clean.factors() (blank clearing off, level dropping on), join the
# result via addLshost2Dashboard() and show the joined row count.
dash <- clean.factors(
  dash,
  clear.blank = FALSE,
  drop.levels = TRUE,
  verbose = FALSE
)
joined <- addLshost2Dashboard(dash)
prettyNum(nrow(joined), big.mark = ",")
## [1] "400,349"
Pre-calculate rates
# io: wall time not accounted for by cpu time.
joined$io <- joined$wall - joined$cpu

# cpu time scaled by cpuf, and the same value divided by events.
joined$cpu_norm <- joined$cpu * joined$cpuf
joined$cpu_norm_norm <- joined$cpu_norm / joined$events

# Wall-time variants: only the cpu part scaled (wall_norm), the whole wall
# time scaled (wall_naive_norm), and io plus the per-event scaled cpu part
# (wall_norm_norm).
joined$wall_norm <- joined$io + joined$cpu_norm
joined$wall_naive_norm <- joined$wall * joined$cpuf
joined$wall_norm_norm <- joined$io + joined$cpu_norm / joined$events

# cpu share of wall time, raw and with the scaled values.
joined$cpu_rate <- joined$cpu / joined$wall
joined$cpu_rate_norm <- joined$cpu_norm / joined$wall_norm
Get ranking of large groups
# Per experiment: rank the groups (get_group_rankings with threshold 5000),
# keep the ranked groups, plot their cpu densities, then narrow `data` to one
# hand-picked test group. `title` is built from the ranked-group data, before
# the narrowing.
# Every plot is wrapped in print(): inside an `if` block only the value of the
# last expression is auto-printed, so an unwrapped earlier ggplot object would
# never be drawn.
if (experiment == "atlas") {
  ranking <- get_group_rankings(joined, "inputfileproject", 5000)
  data <- joined[inputfileproject %in% (ranking$inputfileproject)]
  title <- paste0(
    experiment, " - ", min(data$startDashboard), " to ", max(data$end.time),
    " ", prettyNum(nrow(data), big.mark = ","), " items"
  )
  #' check for a good test group
  print(
    ggplot(data) +
      geom_density(aes(x = cpu, colour = inputfileproject)) +
      labs(title = title)
  )
  #' Select group mc12_8TeV (biggest group with a concise peak)
  data <- joined[inputfileproject == "mc12_8TeV"]
  #' Display the influence of the cpufactor on the cpu distribution
  print(
    ggplot(data) +
      geom_density(aes(x = cpu, colour = as.factor(cpuf))) +
      geom_density(aes(x = cpu))
  )
} else if (experiment == "cms") {
  ranking <- get_group_rankings(joined, "taskname", 5000)
  data <- joined[taskname %in% (ranking$taskname)]
  title <- paste0(
    experiment, " - ", min(data$startDashboard), " to ", max(data$finished),
    " ", prettyNum(nrow(data), big.mark = ","), " items"
  )
  #' check for a good test group
  print(
    ggplot(data) +
      geom_density(aes(x = cpu, colour = taskname)) +
      labs(title = title)
  )
  #' Select group wmagent_amaltaro_FSQ-ppSpring2014-00007_ForceCompletion_140828_201751_9624 (big group with a concise peak)
  data <- joined[taskname == "wmagent_amaltaro_FSQ-ppSpring2014-00007_ForceCompletion_140828_201751_9624"]
  #' Display the influence of io on these jobs
  print(
    ggplot(data) +
      geom_density(aes(x = cpu_rate, colour = taskname)) +
      labs(title = title)
  )
  #' * There is a reasonable io part of about 30% on average
  #' * This means we can check the effect of wall normalization, compared to cpu normalization
}
aggregate by model and inspect deviation between models
# Per-model summary of the selected group: mean, standard deviation and 95%
# confidence bounds for each cpu / wall variant, plus the first cpuf value seen
# for the model.
# NOTE(review): CI() is presumably Rmisc::CI, whose result is ordered
# upper / mean / lower (hence [3] = low, [1] = high) - confirm.
data_by_model <- data[, list(
  cpu.mean = mean(cpu),
  cpu.sd = sd(cpu),
  cpu.ci_low = CI(cpu, ci = .95)[3],
  cpu.ci_high = CI(cpu, ci = .95)[1],
  cpu_norm.mean = mean(cpu_norm),
  cpu_norm.sd = sd(cpu_norm),
  cpu_norm.ci_low = CI(cpu_norm, ci = .95)[3],
  cpu_norm.ci_high = CI(cpu_norm, ci = .95)[1],
  cpu_norm_norm.mean = mean(cpu_norm_norm),
  cpu_norm_norm.sd = sd(cpu_norm_norm),
  cpu_norm_norm.ci_low = CI(cpu_norm_norm, ci = .95)[3],
  cpu_norm_norm.ci_high = CI(cpu_norm_norm, ci = .95)[1],
  wall.mean = mean(wall),
  wall.sd = sd(wall),
  wall.ci_low = CI(wall, ci = .95)[3],
  wall.ci_high = CI(wall, ci = .95)[1],
  wall_norm.mean = mean(wall_norm),
  wall_norm.sd = sd(wall_norm),
  wall_norm.ci_low = CI(wall_norm, ci = .95)[3],
  wall_norm.ci_high = CI(wall_norm, ci = .95)[1],
  wall_naive_norm.mean = mean(wall_naive_norm),
  # Fixed: this previously used sd(wall_norm), i.e. the wrong column.
  wall_naive_norm.sd = sd(wall_naive_norm),
  wall_naive_norm.ci_low = CI(wall_naive_norm, ci = .95)[3],
  wall_naive_norm.ci_high = CI(wall_naive_norm, ci = .95)[1],
  cpuf = cpuf[1]
), by = list(model)]
# Load the saved benchmark objects into the workspace.
# NOTE(review): the plots below use `bench`, `bench.model_direct` and
# `bench.model`, which are not created in this script - presumably they come
# from this file; confirm.
load("~/afs/R/R_Repo/analysis/normalization/lsf-bench.RData")
Compare normalized and “raw” peaks for cpu
Display the influence of the cpufactor on the cpu distribution
# Density of b.m per cpuf value (coloured) together with the overall density.
ggplot(bench) +
  geom_density(aes(x = b.m, colour = as.factor(cpuf))) +
  geom_density(aes(x = b.m))
Compare “raw” cpu time and normalized cpu time (cpu*cpuf)
# Box plots (outliers hidden) of b.m and of cpu_norm, each grouped by cpuf.
boxplot(formula = b.m ~ cpuf, data = bench, outline = FALSE)
boxplot(formula = cpu_norm ~ cpuf, data = bench, outline = FALSE)
Compare mean and confidence interval between models
# Bar charts of per-model means against cpuf, with error bars taken from the
# ci_low / ci_high columns.
# 1) cpu.mean from bench.model_direct
ggplot(bench.model_direct, aes(x = cpuf, y = cpu.mean, fill = model)) +
  geom_bar(position = position_dodge(), stat = "identity") +
  geom_errorbar(aes(ymin = cpu.ci_low, ymax = cpu.ci_high), width = .1)
# 2) cpu_norm.mean from bench.model_direct
ggplot(bench.model_direct, aes(x = cpuf, y = cpu_norm.mean, fill = model)) +
  geom_bar(position = position_dodge(), stat = "identity") +
  geom_errorbar(
    aes(ymin = cpu_norm.ci_low, ymax = cpu_norm.ci_high),
    width = .1
  )
# 3) cpu_norm.mean_mean from bench.model
ggplot(bench.model, aes(x = cpuf, y = cpu_norm.mean_mean, fill = model)) +
  geom_bar(position = position_dodge(), stat = "identity") +
  geom_errorbar(
    aes(ymin = cpu_norm.ci_low, ymax = cpu_norm.ci_high),
    width = .1
  )
Check correlation between cpuf and cpu times
# Correlation of cpuf with the raw and with the normalized per-model cpu means.
with(bench.model_direct, cor(cpu.mean, cpuf))
## [1] -0.8947847
with(bench.model_direct, cor(cpu_norm.mean, cpuf))
## [1] -0.2084638
Coefficient of variation of raw cpu values
# sd / mean of the per-model raw cpu means.
cv_raw <- sd(bench.model_direct$cpu.mean) / mean(bench.model_direct$cpu.mean)
print(cv_raw)
## [1] 0.3090142
Coefficient of variation after normalization is much better
# sd / mean of the per-model normalized cpu means.
cv_norm <- sd(bench.model_direct$cpu_norm.mean) /
  mean(bench.model_direct$cpu_norm.mean)
print(cv_norm)
## [1] 0.1449827
Compare normalized and “raw” peaks for cpu
Display the influence of the cpufactor on the cpu distribution
# Density of cpu per cpuf value (coloured) together with the overall density.
ggplot(data) +
  geom_density(aes(x = cpu, colour = as.factor(cpuf))) +
  geom_density(aes(x = cpu))
Compare “raw” cpu time and normalized cpu time (cpu*cpuf)
# Box plots per cpuf value, coloured by model: raw cpu first, then cpu_norm.
ggplot(data = data) +
  geom_boxplot(aes(x = as.factor(cpuf), y = cpu, colour = model)) +
  labs(title = title)
ggplot(data = data) +
  geom_boxplot(aes(x = as.factor(cpuf), y = cpu_norm, colour = model)) +
  labs(title = title)
Compare mean and confidence interval between models
# Per-model bar charts against cpuf with error bars from the ci columns:
# raw cpu mean first, then normalized cpu mean.
ggplot(data_by_model, aes(x = cpuf, y = cpu.mean, fill = model)) +
  geom_bar(position = position_dodge(), stat = "identity") +
  geom_errorbar(aes(ymin = cpu.ci_low, ymax = cpu.ci_high), width = .1)
ggplot(data_by_model, aes(x = cpuf, y = cpu_norm.mean, fill = model)) +
  geom_bar(position = position_dodge(), stat = "identity") +
  geom_errorbar(
    aes(ymin = cpu_norm.ci_low, ymax = cpu_norm.ci_high),
    width = .1
  )
Check correlation between cpuf and cpu times
# Correlation of cpuf with raw cpu and with cpu_norm.
with(data, cor(cpuf, cpu))
## [1] -0.6164287
with(data, cor(cpuf, cpu_norm))
## [1] 0.07971474
Coefficient of variation of raw cpu values
# sd / mean of the per-model raw cpu means.
cv_raw <- sd(data_by_model$cpu.mean) / mean(data_by_model$cpu.mean)
print(cv_raw)
## [1] 0.2600274
Coefficient of variation after normalization gets much better
# sd / mean of the per-model normalized cpu means.
cv_norm <- sd(data_by_model$cpu_norm.mean) / mean(data_by_model$cpu_norm.mean)
print(cv_norm)
## [1] 0.1068927
Result:
Display the influence of the events on the normalized cpu distribution
# Density of cpu_norm per events value (coloured) together with the overall
# cpu_norm density. The overall layer previously used x = wall, which put a
# different quantity on the cpu_norm axis; the sibling plots overlay the same
# variable as the coloured layer.
ggplot(data) +
  geom_density(aes(x = cpu_norm, colour = as.factor(events))) +
  geom_density(aes(x = cpu_norm))
Compare “raw” normalized cpu time with additional normalization by events
# Box plots per events value: cpu_norm first, then cpu_norm_norm.
ggplot(data = data) +
  geom_boxplot(aes(x = as.factor(events), y = cpu_norm)) +
  labs(title = title)
ggplot(data = data) +
  geom_boxplot(aes(x = as.factor(events), y = cpu_norm_norm)) +
  labs(title = title)
Check how many entries we have per events:
# Number of rows for each events value.
xtabs(~events, data = data)
## events
## 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
## 2 8 13 21 42 61 97 161 201 280 370 467 492 554 580 572 521 536
## 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 46 48
## 489 413 356 299 253 204 140 118 78 59 29 25 12 8 4 4 4 1
## 50
## 1
Study effect of additional event normalization on distribution between cpu types
# Per-model bar charts against cpuf with error bars from the ci columns:
# cpu_norm mean first, then cpu_norm_norm mean.
ggplot(data_by_model, aes(x = cpuf, y = cpu_norm.mean, fill = model)) +
  geom_bar(position = position_dodge(), stat = "identity") +
  geom_errorbar(
    aes(ymin = cpu_norm.ci_low, ymax = cpu_norm.ci_high),
    width = .1
  )
ggplot(data_by_model, aes(x = cpuf, y = cpu_norm_norm.mean, fill = model)) +
  geom_bar(position = position_dodge(), stat = "identity") +
  geom_errorbar(
    aes(ymin = cpu_norm_norm.ci_low, ymax = cpu_norm_norm.ci_high),
    width = .1
  )
Check if we get a nicer looking peak
# Mean-scaled densities of cpu_norm and of cpu_norm per event on one plot.
ggplot(data) +
  geom_density(aes(x = cpu_norm / mean(cpu_norm), colour = "cpu_norm")) +
  geom_density(
    aes(
      x = (cpu_norm / events) / mean(cpu_norm / events),
      colour = "cpu_norm/events"
    )
  ) +
  labs(title = title)
# Correlation with events and 2 * sd_norm, before and after dividing by events.
cor(data$events, data$cpu_norm)
## [1] 0.7526377
cor(data$events, data$cpu_norm / data$events)
## [1] -0.1051302
2 * sd_norm(data$cpu_norm)
## [1] 0.4816126
2 * sd_norm(data$cpu_norm / data$events)
## [1] 0.3135891
Display the influence of the cpufactor on the wall distribution
# Density of wall per cpuf value (coloured) together with the overall density.
ggplot(data) +
  geom_density(aes(x = wall, colour = as.factor(cpuf))) +
  geom_density(aes(x = wall))
Compare “raw” wall time, naive normalized wall time (wall*cpuf) and advanced, partial wall normalization (cpu*cpuf + io)
# Box plots per cpuf value, coloured by model, all on a 2000-12000 y range:
# wall, wall_naive_norm and wall_norm.
ggplot(data = data) +
  geom_boxplot(aes(x = as.factor(cpuf), y = wall, colour = model)) +
  ylim(2000, 12000) +
  labs(title = title)
ggplot(data = data) +
  geom_boxplot(aes(x = as.factor(cpuf), y = wall_naive_norm, colour = model)) +
  ylim(2000, 12000) +
  labs(title = title)
ggplot(data = data) +
  geom_boxplot(aes(x = as.factor(cpuf), y = wall_norm, colour = model)) +
  ylim(2000, 12000) +
  labs(title = title)
Compare mean and confidence interval between models
# Per-model bar charts against cpuf with error bars from the ci columns:
# wall, wall_naive_norm and wall_norm means.
ggplot(data_by_model, aes(x = cpuf, y = wall.mean, fill = model)) +
  geom_bar(position = position_dodge(), stat = "identity") +
  geom_errorbar(aes(ymin = wall.ci_low, ymax = wall.ci_high), width = .1)
ggplot(data_by_model, aes(x = cpuf, y = wall_naive_norm.mean, fill = model)) +
  geom_bar(position = position_dodge(), stat = "identity") +
  geom_errorbar(
    aes(ymin = wall_naive_norm.ci_low, ymax = wall_naive_norm.ci_high),
    width = .1
  )
ggplot(data_by_model, aes(x = cpuf, y = wall_norm.mean, fill = model)) +
  geom_bar(position = position_dodge(), stat = "identity") +
  geom_errorbar(
    aes(ymin = wall_norm.ci_low, ymax = wall_norm.ci_high),
    width = .1
  )
Check correlation between cpuf and wall times
# Correlation of cpuf with wall, wall_naive_norm and wall_norm.
with(data, cor(wall, cpuf))
## [1] -0.5754738
with(data, cor(wall_naive_norm, cpuf))
## [1] 0.5217003
with(data, cor(wall_norm, cpuf))
## [1] 0.03227462
Coefficient of variation of raw wall values
# sd_norm of the raw wall time.
print(sd_norm(data[["wall"]]))
## [1] 0.247516
Coefficient of variation of naive normalization gets better
# sd_norm of the naively normalized wall time.
print(sd_norm(data[["wall_naive_norm"]]))
## [1] 0.212515
Coefficient of variation of advanced normalization (cpu*cpuf + io) gets even better
# sd_norm of the partially normalized wall time.
print(sd_norm(data[["wall_norm"]]))
## [1] 0.1984268
Display the influence of the events on the wall distribution
# Density of wall per events value (coloured) together with the overall density.
ggplot(data) +
  geom_density(aes(x = wall, colour = as.factor(events))) +
  geom_density(aes(x = wall))
Compare “raw” normalized wall time with additional normalization by events
# Box plots per events value: wall_norm (y range 0-15000), then wall_norm per
# event (y range 100-500).
ggplot(data = data) +
  geom_boxplot(aes(x = as.factor(events), y = wall_norm)) +
  ylim(0, 15000) +
  labs(title = title)
ggplot(data = data) +
  geom_boxplot(aes(x = as.factor(events), y = wall_norm / events)) +
  ylim(100, 500) +
  labs(title = title)
check cpu and io as well
# Box plots per events value: cpu_norm per event, then io per event
# (y range 0-200).
ggplot(data = data) +
  geom_boxplot(aes(x = as.factor(events), y = cpu_norm / events))
ggplot(data = data) +
  geom_boxplot(aes(x = as.factor(events), y = io / events)) +
  ylim(0, 200)
Check again partial normalization of wall time by events
# Box plot per events value of cpu_norm per event plus io (y range 1000-2000).
ggplot(data = data) +
  geom_boxplot(aes(x = as.factor(events), y = cpu_norm / events + io)) +
  ylim(1000, 2000)
# Store that same quantity on `data` as wall_norm_norm.
data$wall_norm_norm <- data$cpu_norm / data$events + data$io
Check if we get a nice looking peak
# Mean-scaled densities of wall, wall_norm and wall_norm_norm on one plot.
ggplot(data) +
  geom_density(aes(x = wall / mean(wall), colour = "wall")) +
  geom_density(aes(x = wall_norm / mean(wall_norm), colour = "wall_norm")) +
  geom_density(
    aes(x = wall_norm_norm / mean(wall_norm_norm), colour = "wall_norm_norm")
  ) +
  labs(title = title)
# Correlation of each wall variant with events, then 2 * sd_norm of each.
cor(data$wall, data$events)
## [1] 0.4616069
cor(data$wall_norm, data$events)
## [1] 0.6989364
cor(data$wall_norm_norm, data$events)
## [1] -0.003498298
print(2 * sd_norm(data$wall))
## [1] 0.495032
print(2 * sd_norm(data$wall_norm))
## [1] 0.3968536
print(2 * sd_norm(data$wall_norm_norm))
## [1] 0.5317404
Check distribution of double normalized wall
# Unscaled density of wall_norm_norm.
ggplot(data) +
  geom_density(aes(x = wall_norm_norm, colour = "wall_norm_norm")) +
  labs(title = title)
Try to see what is different about those outliers:
# Compare rows below the 2000 wall_norm_norm cut with rows above it.
describe_differences(
  data[wall_norm_norm < 2000],
  data[wall_norm_norm > 2000]
)
## names diff_abs diff_sigma
## 1: io 1242.74 13.71
## 2: wall_norm_norm 1248.24 12.90
## 3: wall 1752.66 2.33
## 4: wall_naive_norm 2311.33 1.73
## 5: wall_norm 1661.44 1.60
## 6: cpu 441.95 0.61
## 7: ncpus 2.43 0.47
## 8: cpu_norm 303.96 0.30
## 9: cpu_norm_norm 6.62 0.26
## 10: events 0.68 0.13
## 11: maxmem -1645763353.32 -0.14
## 12: cpuf -0.28 -0.44
## 13: maxswp -5065220943.78 -0.48
## 14: cpu_rate -0.11 -1.49
## 15: cpu_rate_norm -0.12 -2.73
Check io relative to normalized wall time
# Scatter of io against wall_norm_norm, coloured by events value.
ggplot(data) +
  geom_point(aes(x = wall_norm_norm, y = io, colour = as.factor(events))) +
  labs(title = title)
Re-check coefficient of variation for non-outliers
# Same mean-scaled density comparison, restricted to wall_norm_norm < 2000.
ggplot(data[wall_norm_norm < 2000]) +
  geom_density(aes(x = wall / mean(wall), colour = "wall")) +
  geom_density(aes(x = wall_norm / mean(wall_norm), colour = "wall_norm")) +
  geom_density(
    aes(x = wall_norm_norm / mean(wall_norm_norm), colour = "wall_norm_norm")
  ) +
  labs(title = title)
# 2 * sd_norm of each wall variant on that restricted subset.
print(2 * sd_norm(data[wall_norm_norm < 2000]$wall))
## [1] 0.4384051
print(2 * sd_norm(data[wall_norm_norm < 2000]$wall_norm))
## [1] 0.3731874
print(2 * sd_norm(data[wall_norm_norm < 2000]$wall_norm_norm))
## [1] 0.1343652
#' check for full normalization
print(sd_norm(data[["cpu_norm_norm"]]))
## [1] 0.1567945
only events
# sd_norm of raw cpu time per event.
print(sd_norm(data[["cpu"]] / data[["events"]]))
## [1] 0.2771055
only cpuf
# sd_norm of cpu_norm.
print(sd_norm(data[["cpu_norm"]]))
## [1] 0.2408063
no normalization
# sd_norm of the raw cpu time.
print(sd_norm(data[["cpu"]]))
## [1] 0.3369141
Show in plot
# Mean-scaled densities of the four cpu variants on one plot.
ggplot(data) +
  geom_density(aes(x = cpu / mean(cpu), colour = "cpu")) +
  geom_density(aes(x = cpu_norm / mean(cpu_norm), colour = "cpu_norm")) +
  geom_density(
    aes(x = (cpu / events) / mean(cpu / events), colour = "cpu per event")
  ) +
  geom_density(
    aes(x = cpu_norm_norm / mean(cpu_norm_norm), colour = "cpu_norm_norm")
  ) +
  labs(title = title)
Take the dataset without (most of) the outliers
# Rows with wall_norm_norm below the 2000 cut.
clean <- data[wall_norm_norm < 2000]
no normalization
# sd_norm of raw wall time on the cleaned subset.
print(sd_norm(clean[["wall"]]))
## [1] 0.2192025
only events
# sd_norm of (raw cpu per event + io) on the cleaned subset.
print(sd_norm(clean[["cpu"]] / clean[["events"]] + clean[["io"]]))
## [1] 0.07369879
only cpuf
# sd_norm of wall_norm on the cleaned subset.
print(sd_norm(clean[["wall_norm"]]))
## [1] 0.1865937
check for full normalization
# sd_norm of wall_norm_norm on the cleaned subset.
print(sd_norm(clean$wall_norm_norm))
## [1] 0.06718259
# Mean-scaled densities of the four wall variants on the cleaned subset.
ggplot(clean) +
  geom_density(aes(x = wall / mean(wall), colour = "wall")) +
  geom_density(aes(x = wall_norm / mean(wall_norm), colour = "wall_norm")) +
  geom_density(
    aes(
      x = (cpu / events + io) / mean(cpu / events + io),
      colour = "wall event norm"
    )
  ) +
  geom_density(
    aes(x = wall_norm_norm / mean(wall_norm_norm), colour = "wall_norm_norm")
  ) +
  labs(title = title)
# Mean-scaled wall variants against the cpu rate (points, alpha 0.2), with the
# density of the rate overlaid; uses the full `data`, not the cleaned subset.
ggplot(data) +
  geom_point(aes(y = wall / mean(wall), x = cpu_rate), alpha = 0.2) +
  geom_density(aes(x = cpu_rate, colour = "density")) +
  labs(title = title)
ggplot(data) +
  geom_point(
    aes(y = wall_norm / mean(wall_norm), x = cpu_rate_norm),
    alpha = 0.2
  ) +
  geom_density(aes(x = cpu_rate_norm, colour = "density")) +
  labs(title = title)
ggplot(data) +
  geom_point(
    aes(y = wall_norm_norm / mean(wall_norm_norm), x = cpu_rate_norm),
    alpha = 0.2
  ) +
  geom_density(aes(x = cpu_rate_norm, colour = "density")) +
  labs(title = title)
Check Pearson correlations
# Correlation of the cpu rate (raw / normalized) with the wall variants.
with(data, cor(cpu_rate, wall))
## [1] 0.6304693
with(data, cor(cpu_rate, wall_norm_norm))
## [1] -0.2336287
with(data, cor(cpu_rate_norm, wall_norm_norm))
## [1] -0.4633676
# 2 * sd_norm of the raw and of the normalized cpu rate.
2 * sd_norm(data[["cpu_rate"]])
## [1] 0.2587699
2 * sd_norm(data[["cpu_rate_norm"]])
## [1] 0.1379436