Go back to Richel Bilderbeek's homepage.

Go back to Richel Bilderbeek's C++ page.

 

 

 

 

 

(C++) CheckNewick

 

CheckNewick is a Newick code snippets to determine if a std::string is a well-formed Newick.

 

struct BinaryNewickVector
{
  enum { bracket_open  = -1 };
  enum { bracket_close = -2 };
  enum { comma         = -3 };
  enum { new_line      = -4 };
  enum { null          = -5 };
};

///CheckNewick checks if a std::string is a valid Newick.
///If this std::string is not a valid Newick,
///CheckNewick throws an exception with a detailed description
///From http://www.richelbilderbeek.nl/CppCheckNewick.htm
void CheckNewick(const std::string& s)
{
  if (s.size()<3)
    throw std::invalid_argument(
      "The Newick std::string must have a size of at least three characters");
  if (s[0]!='(')
    throw std::invalid_argument(
      "The Newick std::string must start with an opening bracket ('(').");
  if (s[s.size()-1]!=')')
    throw std::invalid_argument(
      "The Newick std::string must end with a closing bracket (')').");
  if (std::count(s.begin(),s.end(),'(')
    !=std::count(s.begin(),s.end(),')'))
    throw std::invalid_argument(
       "The Newick std::string must have as much opening "
       "as closing brackets #1");


  std::string s_copy = s;
  while(s_copy.size()>2) //Find a leaf and cut it until the string is empty
  {
    //Find a leaf
    //Find index i (starting opening bracket) and j (closing bracket)
    const std::size_t sz = s_copy.size();
    std::size_t i = 0;
    std::size_t j = 0;
    for (i=0 ; i!=sz; ++i) //Index of opening bracket
    {
      if (s_copy[i]!='(') continue;
      if (s_copy[i+1]==')')
        throw std::invalid_argument(
          "The Newick std::string cannot have "
          "a consecutive opening and closing bracket");
      for (j=i+1; j!=sz; ++j)
      {
        if (s_copy[j]=='(') { j = 0; break; }
        if (s_copy[j]!=')') continue;
        break;
      }
      if (j ==  0) continue; //j cannot be 0 after previous for loop
      if (j == sz)
        throw std::invalid_argument(
          "The Newick std::string must have as much opening as closing brackets #2");
      break;
    }
    if (s_copy[i]!='(')
      throw std::invalid_argument(
        "The Newick std::string must have as much opening as closing brackets #3");
    //Indices i and j found
    //Is range between i and j valid?
    if (s_copy[i]!='(')
      throw std::logic_error(
        "Bilderbikkel incorrectly assumes that s_copy[i]=='('");
    if (s_copy[j]!=')')
      throw std::logic_error(
        "Bilderbikkel incorrectly assumes that s_copy[j]==')'");
    if (i + 1 != sz && s_copy[i+1]==',')
      throw std::invalid_argument(
        "After bracket open, there can be no comma");
    if (j     !=  0 && s_copy[j-1]==',')
      throw std::invalid_argument(
        "Before bracket close, there can be no comma");
    //Check the range
    for (size_t k=i+1; k!=j; ++k)
    {
      if ( s_copy[k]!='0'
        && s_copy[k]!='1'
        && s_copy[k]!='2'
        && s_copy[k]!='3'
        && s_copy[k]!='4'
        && s_copy[k]!='5'
        && s_copy[k]!='6'
        && s_copy[k]!='7'
        && s_copy[k]!='8'
        && s_copy[k]!='9'
        && s_copy[k]!='0'
        && s_copy[k]!=',')
      {
        std::stringstream err_msg;
        err_msg << "Invalid non-number character in input: '" << s_copy[k] << "'";
        throw std::invalid_argument(err_msg.str().c_str());
      }
      if (k + 1 != sz && s_copy[k] == ',' && s_copy[k+1] == ',')
        throw std::invalid_argument(
          "There can be no consecutive comma's");
    }
    if (i > 0 && s_copy[i-1]=='(' &
      j +1 < sz && s_copy[j + 1] == ')')
    {
      throw std::invalid_argument(
        "Newicks must not have the form ((X))");
    }

    //Range is assumed valid
    //Cut the leaf (turns '(1,2)' to (9) )
    assert(s_copy[i]=='(');
    assert(s_copy[j]==')');
    const std::string s_new_1 = s_copy.substr(0,i);
    const std::string s_new_2 = s_copy.substr(j+1,sz-j-1); //OK
    const std::string s_new =  s_new_1 + "9" + s_new_2;
    s_copy = s_new;
  }
}

///CheckNewick checks if a std::vector<int> is a valid Newick.
///If this std::vector<int> is not a valid Newick,
///CheckNewick throws an exception with a detailed description
///From http://www.richelbilderbeek.nl/CppCheckNewick.htm
void CheckNewick(const std::vector<int>& v)
{
  if (v.size()<3)
    throw std::invalid_argument(
      "The Newick std::vector<int> must have a size of at least three characters");
  if (v[0]!=BinaryNewickVector::bracket_open)
    throw std::invalid_argument(
      "The Newick std::vector<int> must start with an opening bracket ('(').");
  if (v[v.size()-1]!=BinaryNewickVector::bracket_close)
    throw std::invalid_argument(
      "The Newick std::vector<int> must end with a closing bracket (')').");
  if (std::count(v.begin(),v.end(),static_cast<int>(BinaryNewickVector::bracket_open))
    !=std::count(v.begin(),v.end(),static_cast<int>(BinaryNewickVector::bracket_close)))
    throw std::invalid_argument(
       "The Newick std::string must have as much opening "
       "as closing brackets #1");

  std::vector<int> v_copy = v;
  while(v_copy.size()>2) //Find a leaf and cut it until the string is empty
  {
    //Find a leaf
    //Find index i (starting opening bracket) and j (closing bracket)
    const std::size_t sz = v_copy.size();
    std::size_t i = 0;
    std::size_t j = 0;
    for (i=0 ; i!=sz; ++i) //Index of opening bracket
    {
      if (v_copy[i]!=BinaryNewickVector::bracket_open) continue;
      if (v_copy[i+1]==BinaryNewickVector::bracket_close)
        throw std::invalid_argument(
          "The Newick std::vector<int> cannot have "
          "a consecutive opening and closing bracket");
      for (j=i+1; j!=sz; ++j)
      {
        if (v_copy[j]==BinaryNewickVector::bracket_open) { j = 0; break; }
        if (v_copy[j]!=BinaryNewickVector::bracket_close) continue;
        break;
      }
      if (j ==  0) continue; //j cannot be 0 after previous for loop
      if (j == sz)
        throw std::invalid_argument(
          "The Newick std::vector<int> must have as much opening "
          "as closing brackets #2");
      break;
    }
    if (v_copy[i]!=BinaryNewickVector::bracket_open)
      throw std::invalid_argument(
        "The Newick std::vector<int> must have as much opening "
        "as closing brackets #3");
    //Indices i and j found
    //Is range between i and j valid?
    if (v_copy[i]!=BinaryNewickVector::bracket_open)
      throw std::logic_error(
        "Bilderbikkel incorrectly assumes that s_copy[i]=='('");
    if (v_copy[j]!=BinaryNewickVector::bracket_close)
      throw std::logic_error(
        "Bilderbikkel incorrectly assumes that s_copy[j]==')'");
    //Check the range
    for (size_t k=i+1; k!=j; ++k)
    {
      if (v_copy[k] < 0)
      {
        std::ostringstream err_msg;
        err_msg << "Invalid non-number in input: '" << v_copy[k] << "'";
        throw std::invalid_argument(err_msg.str().c_str());
      }
    }
    //Range is assumed valid
    //Cut the leaf
    //Changes '(1,2)' to '(999)'
    assert(v_copy[i]==BinaryNewickVector::bracket_open);
    assert(v_copy[j]==BinaryNewickVector::bracket_close);
    std::vector<int> v_new(v_copy.begin(),v_copy.begin() + i);
    v_new.push_back(999);
    std::copy(v_copy.begin() + j + 1, v_copy.end(),std::back_inserter(v_new));
    v_copy = v_new;
  }
}

 

 

 

 

 

Go back to Richel Bilderbeek's C++ page.

Go back to Richel Bilderbeek's homepage.

 

Valid XHTML 1.0 Strict