Page Constraint Iterator

Example for using node predicates on a deferred suffix tree.

Given a sequences, we want to find all substrings s that fulfill certain constraints. The relative probabilty to see s should be at least p_min. s should also be not longer than replen_max. The latter constraint is a anti-monotonic pattern predicate and can be used in conjunction with the first constraint to cut of the trunk of a suffix tree. Only the top of the suffix tree contains candidates that might fulfill both predicates, so we can use an Index based on a deferred suffix tree (see IndexWotd). The following example demonstrates how to iterate over all suffix tree nodes fulfilling the constraints and output them.

We start by including the necessary headers and use the namespace seqan;

#include <iostream>
#include <seqan/index.h>

using namespace seqan;

Afterwards we create a struct containing the constraints.

struct TMyConstraints
{
    double p_min;
    unsigned int replen_max;
    bool _cachedPred;
};

In the following we do the requiered extensions.

namespace seqan {
// custom TSpec for our customized wotd-Index
struct TMyIndex;

template <typename TText>
struct Cargo<Index<TText, IndexWotd<TMyIndex> > >
{
    typedef TMyConstraints Type;
};

// node predicate
template <typename TText, typename TSpec>
bool nodePredicate(Iter<Index<TText, IndexWotd<TMyIndex> >, TSpec> & it)
{
    TMyConstraints & cons = cargo(container(it));
    unsigned int delta = countSequences(container(it)) * repLength(it);
    unsigned int textLen = length(container(it));

    if (textLen <= delta)
        return cons._cachedPred = true;

    return cons._cachedPred =
        ((double)countOccurrences(it) / (double)(textLen - delta)) > cons.p_min;
}

// monotonic hull
template <typename TText, typename TSpec>
bool nodeHullPredicate(Iter<Index<TText, IndexWotd<TMyIndex> >, TSpec> & it)
{
    TMyConstraints const & cons = cargo(container(it));
    unsigned int textLen = length(container(it));

    if (repLength(it) > cons.replen_max)
        return false;

    unsigned int delta = countSequences(container(it)) * cons.replen_max;
    if (textLen <= delta)
        return true;

    return ((double)countOccurrences(it) /
            (double)(textLen - delta)) > cons.p_min;
}

}

Now we start the main program with a sequence initialization.

int main()
{
    String<char> myString = "How many wood would a woodchuck chuck.";

Then we create our customized index which is a specialization of the deferred @Class.Index.wotd-Index@

    typedef Index<String<char>, IndexWotd<TMyIndex> > TMyIndex;
    TMyIndex myIndex(myString);

    cargo(myIndex).replen_max = 10;
    cargo(myIndex).p_min = 0.05;

And finaly we search by traversing a string tree. To find all strings that fulfill our constraints, we simply do a dfs-traversal via @Function.goBegin@ and @Function.goNext@.

    typedef Iterator<TMyIndex, TopDown<ParentLinks<> > >::Type TConstrainedIterator;
    TConstrainedIterator myConstrainedIterator(myIndex);

    goBegin(myConstrainedIterator);
    while (!atEnd(myConstrainedIterator))
    {

        //@Function.countOccurrences@ returns the number of hits of the representative.
        std::cout << countOccurrences(myConstrainedIterator) << "x  ";

        //The representative string can be determined with @Function.representative@
        std::cout << "\t\"" << representative(myConstrainedIterator) << '\"' << std::endl;

        goNext(myConstrainedIterator);
    }

    return 0;
}
weese@tanne:~/seqan$ cd demos
weese@tanne:~/seqan/demos$ make index_node_predicate
weese@tanne:~/seqan/demos$ ./index_node_predicate
38x     ""
6x      " "
3x      " wo"
2x      " wood"
2x      "a"
4x      "c"
2x      "chuck"
2x      "ck"
3x      "d"
2x      "d "
2x      "huck"
2x      "k"
6x      "o"
2x      "od"
2x      "ood"
3x      "u"
2x      "uck"
4x      "w"
3x      "wo"
2x      "wood"
weese@tanne:~/seqan/demos$