#include <cassert>
#include <deque>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
#include <boost/numeric/conversion/cast.hpp>
#include <boost/foreach.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/regex.hpp>
///NewickVector contains all std::vector<int> constants
struct NewickVector
{
enum { bracket_open = -1 };
enum { bracket_close = -2 };
enum { comma = -3 };
enum { new_line = -4 };
enum { null = -5 };
};
///CheckNewick checks if a std::string is a valid Newick.
///If this std::string is not a valid Newick,
///CheckNewick throws an exception with a detailed description
///From http://www.richelbilderbeek.nl/CppCheckNewick.htm
void CheckNewick(const std::string& s)
{
if (s.size()<3)
throw std::invalid_argument(
"The Newick std::string must have a size of at least three characters");
if (s[0]!='(')
throw std::invalid_argument(
"The Newick std::string must start with an opening bracket ('(').");
if (s[s.size()-1]!=')')
throw std::invalid_argument(
"The Newick std::string must end with a closing bracket (')').");
std::string s_copy = s;
while(!s_copy.empty()) //Find a leaf and cut it until the string is empty
{
//Find a leaf
//Find index i (starting opening bracket) and j (closing bracket)
const std::size_t sz = s_copy.size();
std::size_t i = 0;
std::size_t j = 0;
for (i=0 ; i!=sz; ++i) //Index of opening bracket
{
if (s_copy[i]!='(') continue;
for (j=i+1; j!=sz; ++j)
{
if (s_copy[j]=='(') { j = 0; break; }
if (s_copy[j]!=')') continue;
break;
}
if (j == 0) continue; //j cannot be 0 after previous for loop
if (j == sz)
throw std::invalid_argument(
"The Newick std::string must have as much opening as closing brackets.");
break;
}
if (s_copy[i]!='(')
throw std::invalid_argument(
"The Newick std::string must have as much opening as closing brackets.");
//Indices i and j found
//Is range between i and j valid?
if (s_copy[i]!='(')
throw std::logic_error(
"Bilderbikkel incorrectly assumes that s_copy[i]=='('");
if (s_copy[j]!=')')
throw std::logic_error(
"Bilderbikkel incorrectly assumes that s_copy[j]==')'");
if (i + 1 != sz && s_copy[i+1]==',')
throw std::invalid_argument(
"After bracket open, there can be no comma");
if (j != 0 && s_copy[j-1]==',')
throw std::invalid_argument(
"Before bracket close, there can be no comma");
//Check the range
for (size_t k=i+1; k!=j; ++k)
{
if ( s_copy[k]!='0'
&& s_copy[k]!='1'
&& s_copy[k]!='2'
&& s_copy[k]!='3'
&& s_copy[k]!='4'
&& s_copy[k]!='5'
&& s_copy[k]!='6'
&& s_copy[k]!='7'
&& s_copy[k]!='8'
&& s_copy[k]!='9'
&& s_copy[k]!='0'
&& s_copy[k]!=',')
{
std::stringstream err_msg;
err_msg << "Invalid non-number character in input: '" << s_copy[k] << "'";
throw std::invalid_argument(err_msg.str().c_str());
}
if (k + 1 != sz && s_copy[k] == ',' && s_copy[k+1] == ',')
throw std::invalid_argument(
"There can be no consecutive comma's");
}
//Range is assumed valid
//Cut the leaf
const size_t copy_to = (i == 0 || s_copy[i-1]!=',' ? i : i - 1);
const size_t copy_from = (j + 2 >= sz || s_copy[j+1]!=',' ? j + 1: j + 2);
const std::string s_new_1 = s_copy.substr(0,copy_to);
const std::string s_new_2 = s_copy.substr(copy_from,sz-copy_from); //OK
const std::string s_new = s_new_1 + s_new_2;
s_copy = s_new;
}
}
///IsNewick returns true if a std::string is a valid Newick
///and false otherwise.
///From http://www.richelbilderbeek.nl/CppIsNewick.htm
bool IsNewick(const std::string& s)
{
try
{
CheckNewick(s);
}
catch (...)
{
return false;
}
return true;
}
///NewickToVector converts a Newick std::string to std::vector<int>
///NewickToVector assumes that the input is well-formed and
///has both trailing and tailing brackets.
///The std::vector<int> does not have these anymore, because
///they are useless for any calculation.
///From http://www.richelbilderbeek.nl/CppNewickToVector.htm
const std::vector<int> NewickToVector(const std::string& newick)
{
assert(!newick.empty() && "s must not be empty");
assert(newick[ 0]=='(' && "s must begin with a '('");
assert(newick[newick.size()-1]==')' && "s must end with a ')'");
const std::string s(newick.begin() + 1,newick.end() -1 );
std::vector<int> v;
int value = 0;
BOOST_FOREACH(const char i,s)
{
if (i == '(')
{
if (value!=0) v.push_back(value);
value = 0;
v.push_back(NewickVector::bracket_open);
continue;
}
if (i == ')')
{
if (value!=0) v.push_back(value);
value = 0;
v.push_back(NewickVector::bracket_close);
continue;
}
if (i == ',')
{
if (value!=0) v.push_back(value);
value = 0;
continue;
}
assert(i >= '0' && i <= '9'); //Should be a number
value*=10;
value+=boost::lexical_cast<int>(i);
}
if (value!=0) v.push_back(value);
return v;
}
///SortNewick orders a Newick is such a way
///that all opening brackets are at the left side.
///For example (1,(2,3)) becomes ((2,3),1)
std::string SortNewick(const std::string& newick)
{
assert(IsNewick(newick));
//All leaves are 'cut' by replacing them with an x
std::string s = newick;
std::string n = "";
//Find initial leaf and replace it with x
{
const boost::regex r("\\(\\d+,\\d+\\)");
std::string::const_iterator start = s.begin();
const std::string::const_iterator end = s.end();
boost::match_results<std::string::const_iterator> what;
boost::regex_search(start, end, what, r);
n = what.str();
s = boost::regex_replace(s,r,"x");
}
//When all leaves are cut, s == 'x'
while (s!="x")
{
//Obtain leaf with x
const boost::regex r("(\\(x,\\d+\\))|(\\(\\d+,x\\))");
std::string::const_iterator start = s.begin();
const std::string::const_iterator end = s.end();
boost::match_results<std::string::const_iterator> what;
//Search for inner leaf
boost::regex_search(start, end, what, r);
const std::string l = what.str();
//Search leaf for digit
boost::regex_search(l.begin(), l.end(), what,boost::regex("\\d+"));
//Add digit to n
n = "(" + n + "," + what.str() + ")";
//Replace the leaf by an x
s = boost::regex_replace(s,r,"x");
}
return n;
}
///SortNewick orders a Newick is such a way
///that all opening brackets are at the left side.
///Note that the initial newick must not have brackets
///on both sides: this is useless in any computation.
std::vector<int> SortNewick(const std::vector<int>& newick)
{
assert(newick.size()>2);
assert( (!(newick[0] == NewickVector::bracket_open
&& newick[newick.size()-1] == NewickVector::bracket_open))
&& "Newick must not be enclosed by brackets");
//Find leaf indices
const int sz = boost::numeric_cast<int>(newick.size());
int i = 0;
for ( ; i!=sz-1; ++i) //-1 because i also looks at next index
{
if (newick[i]>0 && newick[i+1]>0) break;
}
assert(i!=sz-1 && "There should be two consecutive values somewhere");
//Add the lead to v
//std::vector<int> v;
std::deque<int> d;
d.push_front(NewickVector::bracket_open);
d.push_back(newick[i]);
d.push_back(newick[i+1]);
d.push_back(NewickVector::bracket_close);
//Start looking for next
for (int distance = 2; ; ++distance)
{
const int value_left = (i + 0 - distance >= 0 ? newick[i + 0 - distance] : NewickVector::null);
const int value_right = (i + 1 + distance < sz ? newick[i + 1 + distance] : NewickVector::null);
if (value_left == NewickVector::null && value_right == NewickVector::null) break;
if (value_left < 0 && value_right < 0) continue;
d.push_front(NewickVector::bracket_open);
d.push_back(std::max(value_left,value_right));
d.push_back(NewickVector::bracket_close);
}
//Remove brackets at edges
d.pop_front();
d.pop_back();
//Convert std::deque to std::vector
std::vector<int> v(d.begin(),d.end());
return v;
}
int main()
{
//Just a Newick...
const std::string s = "(111,(((1,(2,3)),4),9))";
//Way #1: first sort the std::string, then convert it to std::vector<int>
const std::string t = SortNewick(s);
const std::vector<int> u = NewickToVector(t);
//Way #2: first convert the std::string to std::vector<int>, then sort it
const std::vector<int> v = NewickToVector(s);
const std::vector<int> w = SortNewick(v);
//Either way, they must be identical
assert(u == w);
}
|