Go back to Richel Bilderbeek's homepage.

Go back to Richel Bilderbeek's C++ page.

 

 

 

 

 

(C++) Chi-squared goodness-of-fit to a normal distribution test

 

Chi-squared goodness-of-fit to a normal distribution test is a Math code snippet to perform the same chi-squared goodness-of-fit (to a normal distribution) as [1].

 

#include <algorithm>
#include <cassert>
#include <cmath>
#include <functional>
#include <iterator>
#include <iostream>
#include <numeric>
#include <ostream>
#include <vector>
#include <boost/math/distributions/chi_squared.hpp>

//From http://www.richelbilderbeek.nl/CppGetCumulativeDensityNormal.htm
//Adapted from http://www.ma.ic.ac.uk/~mdavis/course_material/MOP/CumNormDist.txt
double GetCumulativeDensityNormal(const double x)
{
  const double c0 = 0.2316419;
  const double c1 = 1.330274429;
  const double c2 = 1.821255978;
  const double c3 = 1.781477937;
  const double c4 = 0.356563782;
  const double c5 = 0.319381530;
  const double c6 = 0.398942280401;
  const double negative = (x < 0 ? 1.0 : 0.0);
  const double xPos = (x < 0.0 ? -x : x);
  const double k = 1.0 / ( 1.0 + (c0 * xPos));
  const double y1 = (((((((c1*k-c2)*k)+c3)*k)-c4)*k)+c5)*k;
  const double y2 = 1.0 - (c6*std::exp(-0.5*xPos*xPos)*y1);
  return ((1.0-negative)*y2) + (negative*(1.0-y2));
}

//From http://www.richelbilderbeek.nl/CppGetCumulativeDensityNormal.htm
double GetCumulativeDensityNormal(
  const double x,
  const double mean,
  const double stddev)
{
  return GetCumulativeDensityNormal(
    (x-mean) / stddev);
}

//From htpp://www.richelbilderbeek.nl/CppGetMean.htm
double GetMean(const std::vector<double>& v)
{
  return std::accumulate(v.begin(),v.end(),0.0) / static_cast<double>(v.size());
}

template <class T> struct SquareAccumulator
  : public std::binary_function<T,T,T>
{
  const T operator()(const T& sum, const T& x) const
    { return sum+(x*x); }
};

//From http://www.richelbilderbeek.nl/CppGetStdDev.htm
double GetStdDev(const std::vector<double>& v)
{
  assert(v.size() > 1
    && "Can only calculate standard deviations from "
       "data sets with size 2 or larger");
  const double sum_x
    = std::accumulate(v.begin(),v.end(),0.0);
  const double sum_x_squared
    = std::accumulate(v.begin(),v.end(),
        0.0,SquareAccumulator<double>());
  const double sz = static_cast<double>(v.size());
  return std::sqrt(((sz*sum_x_squared)-(sum_x*sum_x))
    / (sz*(sz-1.0)));
}

//Data from:
// * David Heath. An introduction to experimental
// design and statistics for biology. 1995.
// ISBN: 1-85728-132-2 PB. Page 149, box 6.9:
// Chi-squared goodness-of-fit test to a normal
// distribution: body lengths of the
// isopod (Sphaeroma Rugicauda)
//Isopod sizes in millimetres
const std::vector<double> GetIsopodSizes()
{
  std::vector<double> v;
  for (int i=0; i!= 5; ++i) v.push_back(3.65);
  for (int i=0; i!= 9; ++i) v.push_back(3.85);
  for (int i=0; i!=20; ++i) v.push_back(4.05);
  for (int i=0; i!=30; ++i) v.push_back(4.25);
  for (int i=0; i!=29; ++i) v.push_back(4.45);
  for (int i=0; i!=44; ++i) v.push_back(4.65);
  for (int i=0; i!=44; ++i) v.push_back(4.85);
  for (int i=0; i!=69; ++i) v.push_back(5.05);
  for (int i=0; i!=50; ++i) v.push_back(5.25);
  for (int i=0; i!=50; ++i) v.push_back(5.45);
  for (int i=0; i!=29; ++i) v.push_back(5.65);
  for (int i=0; i!=14; ++i) v.push_back(5.85);
  for (int i=0; i!= 4; ++i) v.push_back(6.05);
  for (int i=0; i!= 1; ++i) v.push_back(6.25);
  for (int i=0; i!= 2; ++i) v.push_back(6.45);
  //For fun, let's shuffle these values
  std::random_shuffle(v.begin(),v.end());
  return v;
}

const std::vector<std::pair<double,double> > GetRanges(
  const std::size_t n_categories)
{
  std::vector<std::pair<double,double> > v;
  double lower_limit = 3.45;
  double range_bandwidth = 0.2;
  for (std::size_t i=0; i!=n_categories; ++i)
  {
    v.push_back(std::make_pair(
      lower_limit,lower_limit+range_bandwidth));
    lower_limit+=range_bandwidth;
  }
  return v;
}

const std::vector<int> Tally(
  const std::vector<double>& values,
  const std::vector<std::pair<double,double> >& ranges)
{
  std::vector<int> tally(ranges.size());
  const std::size_t n_values = values.size();
  const std::size_t n_ranges = ranges.size();
  for (std::size_t value_index = 0; value_index!=n_values; ++value_index)
  {
    assert(value_index < values.size());
    const double value = values[value_index];
    for (std::size_t range_index = 0; range_index!=n_ranges; ++range_index)
    {
      const std::pair<double,double> &range = ranges[range_index];
      if ( range.first < value && value < range.second)
      {
        ++tally[range_index];
        break;
      }
    }

  }
  assert( std::accumulate(tally.begin(),tally.end(),0)
    == static_cast<int>(values.size())
    && "Assume all values can be tallied in existing ranges");
  return tally;
}

const std::vector<double> GetTallyExpected(
  const std::vector<std::pair<double,double> >& ranges,
  const double mean,
  const double stdDev,
  const std::size_t n)
{
  const std::size_t sz = ranges.size();
  std::vector<double> v(sz,0.0);
  for (std::size_t i = 0; i!=sz; ++i)
  {
    const double x_left = ranges[i].first;
    const double x_right = ranges[i].second;
    assert(x_left < x_right);
    const double surface_left
      = GetCumulativeDensityNormal(x_left ,mean,stdDev)
      * static_cast<double>(n);
    const double surface_right
      = GetCumulativeDensityNormal(x_right,mean,stdDev)
      * static_cast<double>(n);
    assert(surface_left < surface_right);
    v[i] = surface_right - surface_left;
  }
  return v;
}

const std::vector<double> CalculateRelativeError(
  const std::vector<int>& tally_measured,
  const std::vector<double>& tally_expected)
{
  assert(tally_measured.size() == tally_expected.size());
  const std::size_t sz = tally_measured.size();
  std::vector<double> v(sz);
  for (std::size_t i = 0; i!=sz; ++i)
  {
    const double obs = static_cast<double>(tally_measured[i]);
    const double exp = tally_expected[i];
    assert(exp!=0.0);
    v[i] = ((obs-exp)*(obs-exp))/exp;
  }
  return v;
}

int main()
{
  const std::size_t n_categories = 15;

  const double degrees_of_freedom
    = static_cast<double>(n_categories)
    - 1.0 //We need to calculate the mean ourselves
    - 1.0 //We need to calculate the standard deviation ourselves
    - 1.0; //We need to calculate the sample size ourselves

  const std::vector<double> isopod_sizes = GetIsopodSizes();
  const double mean = GetMean(isopod_sizes);
  const double stdDev = GetStdDev(isopod_sizes);
  const std::vector<std::pair<double,double> > ranges = GetRanges(n_categories);
  assert(n_categories == ranges.size());
  const std::vector<int> tally_measured = Tally(isopod_sizes,ranges);
  const std::vector<double> tally_expected
    = GetTallyExpected(ranges,mean,stdDev,isopod_sizes.size());
  const std::vector<double> rel_error
    = CalculateRelativeError(tally_measured,tally_expected);

  for (std::size_t i=0; i!=n_categories; ++i)
  {
    std::cout
      << ranges[i].first << "-" << ranges[i].second << ":\t"
      << tally_measured[i] << "\t"
      << tally_expected[i] << "\t"
      << rel_error[i] << "\n";
  }

  const double significance_level = 0.05;
  const double chi_squared_value
    = std::accumulate(rel_error.begin(),rel_error.end(),0.0);
  boost::math::chi_squared_distribution<double> distribution(degrees_of_freedom);
  const double critical_value
    = boost::math::quantile(boost::math::complement(distribution, significance_level));
  std::cout
    << "Mean size: " << mean
    << "\nStdDev size: " << stdDev
    << "\nSUM observer: "
      << std::accumulate(tally_measured.begin(),tally_measured.end(), 0)
    << "\nSUM expected: "
      << std::accumulate(tally_expected.begin(),tally_expected.end(),0.0)
    << "\nChi-square value: " << chi_squared_value
    << "\nSignificance level: " << significance_level
    << "\nDegrees of freedom: " << degrees_of_freedom
    << "\nCritical value: " << critical_value << '\n';
  if (chi_squared_value < critical_value)
  {
    std::cout
      << "Cannot reject null hypothesis that the measured values "
         "do follow a normal distribution" << std::endl;
  }
  else
  {
    std::cout
      << "Reject null hypothesis that the measured values "
         "do follow a normal distribution" << std::endl;
  }
}

 

Screen output

 

Starting /MyFolder/MyProject...
3.45-3.65: 5 2.08394 4.08045
3.65-3.85: 9 5.04813 3.09368
3.85-4.05: 20 10.6748 8.14617
4.05-4.25: 30 19.7052 5.37843
4.25-4.45: 29 31.7539 0.238841
4.45-4.65: 44 44.6702 0.0100565
4.65-4.85: 44 54.8586 2.14932
4.85-5.05: 69 58.8136 1.76427
5.05-5.25: 50 55.0452 0.462415
5.25-5.45: 50 44.9747 0.561515
5.45-5.65: 29 32.0791 0.295544
5.65-5.85: 14 19.9747 1.78711
5.85-6.05: 4 10.8576 4.33126
6.05-6.25: 1 5.15205 3.34615
6.25-6.45: 2 2.13408 0.00842393
Mean size: 4.9525
StdDev size: 0.539557
SUM observer: 400
SUM expected: 397.826
Chi-square value: 35.6536
Significance level: 0.05
Degrees of freedom: 12
Critical value: 21.0261
Reject null hypothesis that the measured values do follow a normal distribution
/MyFolder/MyProject exited with code 0

 

Note that I draw a different conclusion than [1]. This is probably due that I did not have the original body sizes, but recreated these from a tally.

 

 

 

 

 

References

 

  1. David Heath. An introduction to experimental design and statistics for biology. 1995. ISBN: 1-85728-132-2 PB. Page 149, box 6.9: Chi-squared goodness-of-fit test to a normal distribution: body lengths of the isopod (Sphaeroma Rugicauda)

 

 

 

 

 

Go back to Richel Bilderbeek's C++ page.

Go back to Richel Bilderbeek's homepage.

 

Valid XHTML 1.0 Strict