/** @file sd.cc * * @brief sd - demonstrate the right way to calculate a standard deviation * * @author Manuel Schiller * @date 2022-01-12 * * To compile, use something like this command line: * * @code * g++ -std=c++14 -Wall -Wextra -pedantic -O2 -g -o sqrdiff sqrdiff.cc * @endcode * * Whenever one sums up differences of quantities of different size, one needs * to keep in mind that floating point numbers on a computer do not have a * commutative addition/subtraction, i.e. (a + b) + c is not neccessarily the * same as a + (b + c). * * For example, to calculate x * x - y * y, one should always rewrite this to * the equivalent, but numerically more stable (x + y) * (x - y). For example, * consider x = 1.00 and y = 1.01 (and for illustration's sake, we calculate * with three significant digits in decimal): * * correct result: x * x - y * y = 2.01 * 10^-2 * * naïve calculation: * x * x = 1.00 * 1.00 = 1.00 * y * y = 1.01 * 1.01 = 1.01 + 0.0101 = 1.02 * x * x - y * y = 1.00 - 1.02 = -0.02 * * correct approach: * (x + y) = 2.01 * (x - y) = -0.01 * (x + y) * (x - y) = -2.01 * 10^-2 * * As you can see, the naïve calculation above loses digits because they get * cut off. The same thing happens when using floating point numbers, only one * typically gets a few more digits out before bad things happen (and the * mantissa stores binary digits). * * This kind of problem turns up whenever one subtracts two floating point * numbers, and it gets worse if one has large cancellations. * * When calculating variances or standard deviations, the same kind of problem * can bite the unwary calculator in the loop that calculates the variance or * standard deviation. The naive way to calculate this is in sd_ugly below, * and the code has a huge problem because it counts one the cancellation of * the squared mean in the line marked "*****". * * A much better way is a two-pass algorithm in sd_good which calculates * the mean first, then subtracts it in the loop that calculates the sum for * the variance or standard deviation. See below for an example in code... * * If you absolutely need a single-pass algorithm, there are ways to keep * running averages of both mean and variance or standard deviation, but * please check the literature, and do not use the naïve method in sd_ugly * below! * * Feel free to play with the setting for sd_gen in main() below, * interesting values might be 1e-1, 1e-2, 1e-3, 1e-4, because they'll * illustrate that calculating in double precision does not render the naïve * approach usable, it merely postpones the problem a bit... */ #include #include #include #include #include #include using default_float_type = float; /////////////////////////////////////////////////////////////////////// // routines for unweighted data /////////////////////////////////////////////////////////////////////// template static FLT sd_ugly(const T& arr) { // This is the naïve single-pass approach. It's bad, do not use it! FLT x2 = 0, x = 0; std::size_t n = 0; for (const auto& el: arr) { x += el; x2 += el * el; ++n; } const FLT var = x2 - x * x / n; // ***** DO NOT DO THIS - YOU LOSE PRECISION! return std::sqrt(var / (n - 1)); } template static FLT mean(const T& arr) { FLT x = 0; std::size_t n = 0; for (const auto& el: arr) { x += el; ++n; } return x / n; } template static FLT sd_good(const T& arr) { // this needs two passes over the data - one is hidden inside the call to // mean... const auto x = mean(arr); FLT x2 = 0; std::size_t n = 0; for (const auto& el: arr) { const auto diff = el - x; // lose as little precision as possible x2 += diff * diff; ++n; } return std::sqrt(x2 / (n - 1)); } template static FLT sd_better(const T& arr) { // this needs two passes over the data - one is hidden inside the call to // mean... const auto x = mean(arr); FLT x2 = 0, ex = 0; std::size_t n = 0; for (const auto& el: arr) { const auto diff = el - x; // lose as little precision as possible ex += diff; x2 += diff * diff; ++n; } const auto var = (x2 - ex * ex / n) / (n - 1); return std::sqrt(var); } template static FLT sd_best(const T& arr) { FLT mean = 0, var = 0; std::size_t n = 0; for (const auto& el: arr) { const auto delta = el - mean; mean += delta / (n + 1); var += (el - mean) * delta; ++n; } return std::sqrt(var / (n - 1)); } /////////////////////////////////////////////////////////////////////// // routines for weighted data /////////////////////////////////////////////////////////////////////// template static FLT sd_ugly(const T& arr, const T& weights) { assert(arr.size() == weights.size()); // This is the naïve single-pass approach. It's bad, do not use it! FLT x2 = 0, x = 0, wsum = 0; auto it = weights.cbegin(); for (const auto& el: arr) { const auto& w = *it++; wsum += w; x += w * el; x2 += w * el * el; } const FLT var = x2 - x * x / wsum; // ***** DO NOT DO THIS - YOU LOSE PRECISION! return std::sqrt(var / (wsum - 1)); } template static FLT mean(const T& arr, const T& weights) { assert(arr.size() == weights.size()); FLT x = 0, wsum = 0; auto it = weights.cbegin(); for (const auto& el: arr) { const auto& w = *it++; wsum += w; x += w * el; } return x / wsum; } template static FLT sd_good(const T& arr, const T& weights) { assert(arr.size() == weights.size()); // this needs two passes over the data - one is hidden inside the call to // mean... const auto x = mean(arr); FLT x2 = 0, wsum = 0; auto it = weights.cbegin(); for (const auto& el: arr) { const auto& w = *it++; wsum += w; const auto diff = el - x; // lose as little precision as possible x2 += w * diff * diff; } return std::sqrt(x2 / (wsum - 1)); } template static FLT sd_better(const T& arr, const T& weights) { assert(arr.size() == weights.size()); // this needs two passes over the data - one is hidden inside the call to // mean... const auto x = mean(arr); FLT x2 = 0, ex = 0, wsum = 0; auto it = weights.cbegin(); for (const auto& el: arr) { const auto& w = *it++; wsum += w; const auto diff = el - x; // lose as little precision as possible ex += w * diff; x2 += w * diff * diff; } const auto var = (x2 - ex * ex / wsum) / (wsum - 1); return std::sqrt(var); } template static FLT sd_best(const T& arr, const T& weights) { assert(arr.size() == weights.size()); FLT mean = 0, var = 0, wsum = 0; auto it = weights.cbegin(); for (const auto& el: arr) { const auto& w = *it++; wsum += w; const auto delta = el - mean; mean += (w / wsum) * delta; var += w * (el - mean) * delta; } return std::sqrt(var / (wsum - 1)); } // little helper to calculate standard deviation and print some numbers template static void printmethod(const char* methname, const char* fltname, default_float_type sd_gen, FN method, const ARGS&... args) { static_assert(0 < sizeof...(ARGS) && sizeof...(ARGS) < 3, "printmethod takes one or two arguments"); const auto avg = mean(args...); const auto sigma = method(args...); const auto diff = sigma - sd_gen; std::printf("%6s(%6s) method: mean % 14.7e sigma % 14.7e rel. diff " "% 14.7e\n", methname, fltname, avg, sigma, diff / sd_gen); } int main() { // generate a random sample of 2^20=1024*1024 samples const std::size_t n = 1 << 20; // get samples from a normal distribution with mean 1, sd_gen is 1e-2 // by default, but watch what happens for 1e-3 or 1e-4, for example const default_float_type mean_gen = 1, sd_gen = 1e-2; std::mt19937 gen(static_cast(time(nullptr))); std::normal_distribution gaus(mean_gen, sd_gen); /////////////////////////////////////////////////////////////////// // first test: play with unweighted data /////////////////////////////////////////////////////////////////// // fill data: normally distributed, mean mean_gen, width sd_gen std::vector data, weights; data.reserve(n); weights.reserve(n); for (std::size_t i = 0; n != i; ++i) data.push_back(gaus(gen)); // okay, try the different methods by calculating the sigmas of the sample // with each method, once in float and once in double precision std::printf("testing unweighted data:\n" " generation values: mean % 13.7e sigma % 13.7e\n", mean_gen, sd_gen); printmethod( "ugly", "float", sd_gen, [](const auto& arr) { return sd_ugly(arr); }, data); printmethod( "good", "float", sd_gen, [](const auto& arr) { return sd_good(arr); }, data); printmethod( "better", "float", sd_gen, [](const auto& arr) { return sd_better(arr); }, data); printmethod( "best", "float", sd_gen, [](const auto& arr) { return sd_best(arr); }, data); printmethod( "ugly", "double", sd_gen, [](const auto& arr) { return sd_ugly(arr); }, data); printmethod( "good", "double", sd_gen, [](const auto& arr) { return sd_good(arr); }, data); printmethod( "better", "double", sd_gen, [](const auto& arr) { return sd_better(arr); }, data); printmethod( "best", "double", sd_gen, [](const auto& arr) { return sd_best(arr); }, data); std::printf("\n"); /////////////////////////////////////////////////////////////////// // second test: play with weighted data /////////////////////////////////////////////////////////////////// // weights are flat from [0, 1]; 1/8 of data is background has // sigma 8 times wider, is a little shifted, and is subtracted with // weights by randomly assigning negative weights from [-1, 0] for // half of the wider distribution data.clear(); std::normal_distribution wgaus(mean_gen + sd_gen, 8 * sd_gen); std::uniform_real_distribution wflat( default_float_type(0), default_float_type(1)); for (std::size_t i = 0, j = 0; n != i; ++i) { if (0 == (i & 7)) { data.push_back(gaus(gen)); weights.push_back(int(1 - 2 * (j & 1)) * wflat(gen)); ++j; } else { data.push_back(gaus(gen)); weights.push_back(wflat(gen)); } } // okay, try the different methods by calculating the sigmas of the sample // with each method, once in float and once in double precision std::printf("testing weighted data:\n" " generation values: mean % 13.7e sigma % 13.7e\n", mean_gen, sd_gen); printmethod( "ugly", "float", sd_gen, [](const auto& arr, const auto& w) { return sd_ugly(arr, w); }, data, weights); printmethod( "good", "float", sd_gen, [](const auto& arr, const auto& w) { return sd_good(arr, w); }, data, weights); printmethod( "better", "float", sd_gen, [](const auto& arr, const auto& w) { return sd_better(arr, w); }, data, weights); printmethod( "best", "float", sd_gen, [](const auto& arr, const auto& w) { return sd_best(arr, w); }, data, weights); printmethod( "ugly", "double", sd_gen, [](const auto& arr, const auto& w) { return sd_ugly(arr, w); }, data, weights); printmethod( "good", "double", sd_gen, [](const auto& arr, const auto& w) { return sd_good(arr, w); }, data, weights); printmethod( "better", "double", sd_gen, [](const auto& arr, const auto& w) { return sd_better(arr, w); }, data, weights); printmethod( "best", "double", sd_gen, [](const auto& arr, const auto& w) { return sd_best(arr, w); }, data, weights); return 0; }