Go back to Richel Bilderbeek's homepage.

Go back to Richel Bilderbeek's C++ page.

 

 

 

 

 

(C++) SortNewick

 

SortNewick is a Newick code snippets to sort binary-tree Newick in such a way that all opening brackets are at the right.

 

#include <cassert>
#include <deque>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#include <boost/numeric/conversion/cast.hpp>
#include <boost/foreach.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/regex.hpp>

///NewickVector contains all std::vector<int> constants
struct NewickVector
{
  enum { bracket_open  = -1 };
  enum { bracket_close = -2 };
  enum { comma         = -3 };
  enum { new_line      = -4 };
  enum { null          = -5 };
};

///CheckNewick checks if a std::string is a valid Newick.
///If this std::string is not a valid Newick,
///CheckNewick throws an exception with a detailed description
///From http://www.richelbilderbeek.nl/CppCheckNewick.htm
void CheckNewick(const std::string& s)
{
  if (s.size()<3)
    throw std::invalid_argument(
      "The Newick std::string must have a size of at least three characters");
  if (s[0]!='(')
    throw std::invalid_argument(
      "The Newick std::string must start with an opening bracket ('(').");
  if (s[s.size()-1]!=')')
    throw std::invalid_argument(
      "The Newick std::string must end with a closing bracket (')').");

  std::string s_copy = s;
  while(!s_copy.empty()) //Find a leaf and cut it until the string is empty
  {
    //Find a leaf
    //Find index i (starting opening bracket) and j (closing bracket)
    const std::size_t sz = s_copy.size();
    std::size_t i = 0;
    std::size_t j = 0;
    for (i=0 ; i!=sz; ++i) //Index of opening bracket
    {
      if (s_copy[i]!='(') continue;
      for (j=i+1; j!=sz; ++j)
      {
        if (s_copy[j]=='(') { j = 0; break; }
        if (s_copy[j]!=')') continue;
        break;
      }
      if (j ==  0) continue; //j cannot be 0 after previous for loop
      if (j == sz)
        throw std::invalid_argument(
          "The Newick std::string must have as much opening as closing brackets.");
      break;
    }
    if (s_copy[i]!='(')
      throw std::invalid_argument(
        "The Newick std::string must have as much opening as closing brackets.");
    //Indices i and j found
    //Is range between i and j valid?
    if (s_copy[i]!='(')
      throw std::logic_error(
        "Bilderbikkel incorrectly assumes that s_copy[i]=='('");
    if (s_copy[j]!=')')
      throw std::logic_error(
        "Bilderbikkel incorrectly assumes that s_copy[j]==')'");
    if (i + 1 != sz && s_copy[i+1]==',')
      throw std::invalid_argument(
        "After bracket open, there can be no comma");
    if (j     !=  0 && s_copy[j-1]==',')
      throw std::invalid_argument(
        "Before bracket close, there can be no comma");
    //Check the range
    for (size_t k=i+1; k!=j; ++k)
    {
      if ( s_copy[k]!='0'
        && s_copy[k]!='1'
        && s_copy[k]!='2'
        && s_copy[k]!='3'
        && s_copy[k]!='4'
        && s_copy[k]!='5'
        && s_copy[k]!='6'
        && s_copy[k]!='7'
        && s_copy[k]!='8'
        && s_copy[k]!='9'
        && s_copy[k]!='0'
        && s_copy[k]!=',')
      {
        std::stringstream err_msg;
        err_msg << "Invalid non-number character in input: '" << s_copy[k] << "'";
        throw std::invalid_argument(err_msg.str().c_str());
      }
      if (k + 1 != sz && s_copy[k] == ',' && s_copy[k+1] == ',')
        throw std::invalid_argument(
          "There can be no consecutive comma's");
    }
    //Range is assumed valid
    //Cut the leaf
    const size_t copy_to = (i == 0 || s_copy[i-1]!=',' ? i : i - 1);
    const size_t copy_from = (j + 2 >= sz  || s_copy[j+1]!=',' ? j + 1: j + 2);
    const std::string s_new_1 = s_copy.substr(0,copy_to);
    const std::string s_new_2 = s_copy.substr(copy_from,sz-copy_from); //OK
    const std::string s_new =  s_new_1 + s_new_2;
    s_copy = s_new;
  }
}

///IsNewick returns true if a std::string is a valid Newick
///and false otherwise.
///From http://www.richelbilderbeek.nl/CppIsNewick.htm
bool IsNewick(const std::string& s)
{
  try
  {
    CheckNewick(s);
  }
  catch (...)
  {
    return false;
  }
  return true;
}

///NewickToVector converts a Newick std::string to std::vector<int>
///NewickToVector assumes that the input is well-formed and
///has both trailing and tailing brackets.
///The std::vector<int> does not have these anymore, because
///they are useless for any calculation.
///From http://www.richelbilderbeek.nl/CppNewickToVector.htm
const std::vector<int> NewickToVector(const std::string& newick)
{
  assert(!newick.empty()              && "s must not be empty");
  assert(newick[              0]=='(' && "s must begin with a '('");
  assert(newick[newick.size()-1]==')' && "s must end with a ')'");

  const std::string s(newick.begin() + 1,newick.end() -1 );

  std::vector<int> v;
  int value = 0;

  BOOST_FOREACH(const char i,s)
  {
    if (i == '(')
    {
      if (value!=0) v.push_back(value);
      value = 0;
      v.push_back(NewickVector::bracket_open);
      continue;
    }
    if (i == ')')
    {
      if (value!=0) v.push_back(value);
      value = 0;
      v.push_back(NewickVector::bracket_close);
      continue;
    }
    if (i == ',')
    {
      if (value!=0) v.push_back(value);
      value = 0;
      continue;
    }
    assert(i >= '0' && i <= '9'); //Should be a number
    value*=10;
    value+=boost::lexical_cast<int>(i);
  }
  if (value!=0) v.push_back(value);

  return v;
}

///SortNewick orders a Newick is such a way
///that all opening brackets are at the left side.
///For example (1,(2,3)) becomes ((2,3),1)
std::string SortNewick(const std::string& newick)
{
  assert(IsNewick(newick));
  //All leaves are 'cut' by replacing them with an x
  std::string s = newick;
  std::string n = "";
  //Find initial leaf and replace it with x
  {
    const boost::regex r("\\(\\d+,\\d+\\)");
    std::string::const_iterator start = s.begin();
    const std::string::const_iterator end = s.end();
    boost::match_results<std::string::const_iterator> what;
    boost::regex_search(start, end, what, r);
    n = what.str();
    s = boost::regex_replace(s,r,"x");
  }
  //When all leaves are cut, s == 'x'
  while (s!="x")
  {
    //Obtain leaf with x
    const boost::regex r("(\\(x,\\d+\\))|(\\(\\d+,x\\))");
    std::string::const_iterator start = s.begin();
    const std::string::const_iterator end = s.end();
    boost::match_results<std::string::const_iterator> what;
    //Search for inner leaf
    boost::regex_search(start, end, what, r);
    const std::string l = what.str();
    //Search leaf for digit
    boost::regex_search(l.begin(), l.end(), what,boost::regex("\\d+"));
    //Add digit to n
    n = "(" + n + "," + what.str() + ")";
    //Replace the leaf by an x
    s = boost::regex_replace(s,r,"x");
  }
  return n;
}

///SortNewick orders a Newick is such a way
///that all opening brackets are at the left side.
///Note that the initial newick must not have brackets
///on both sides: this is useless in any computation.
std::vector<int> SortNewick(const std::vector<int>& newick)
{
  assert(newick.size()>2);
  assert( (!(newick[0] == NewickVector::bracket_open
    && newick[newick.size()-1] == NewickVector::bracket_open))
    && "Newick must not be enclosed by brackets");
  //Find leaf indices
  const int sz = boost::numeric_cast<int>(newick.size());
  int i = 0;
  for ( ; i!=sz-1; ++i) //-1 because i also looks at next index
  {
    if (newick[i]>0 && newick[i+1]>0) break;
  }
  assert(i!=sz-1 && "There should be two consecutive values somewhere");
  //Add the lead to v
  //std::vector<int> v;
  std::deque<int> d;
  d.push_front(NewickVector::bracket_open);
  d.push_back(newick[i]);
  d.push_back(newick[i+1]);
  d.push_back(NewickVector::bracket_close);
  //Start looking for next
  for (int distance = 2; ; ++distance)
  {
    const int value_left  = (i + 0 - distance >=  0 ? newick[i + 0 - distance] : NewickVector::null);
    const int value_right = (i + 1 + distance  < sz ? newick[i + 1 + distance] : NewickVector::null);
    if (value_left == NewickVector::null && value_right == NewickVector::null) break;
    if (value_left < 0 && value_right < 0) continue;
    d.push_front(NewickVector::bracket_open);
    d.push_back(std::max(value_left,value_right));
    d.push_back(NewickVector::bracket_close);
  }
  //Remove brackets at edges
  d.pop_front();
  d.pop_back();
  //Convert std::deque to std::vector
  std::vector<int> v(d.begin(),d.end());
  return v;
}

int main()
{
  //Just a Newick...
  const std::string s = "(111,(((1,(2,3)),4),9))";

  //Way #1: first sort the std::string, then convert it to std::vector<int>
  const std::string t = SortNewick(s);
  const std::vector<int> u = NewickToVector(t);

  //Way #2: first convert the std::string to std::vector<int>, then sort it
  const std::vector<int> v = NewickToVector(s);
  const std::vector<int> w = SortNewick(v);

  //Either way, they must be identical
  assert(u == w);
}

 

 

 

 

 

Go back to Richel Bilderbeek's C++ page.

Go back to Richel Bilderbeek's homepage.

 

Valid XHTML 1.0 Strict