Example Program
Constraint Iterator
Example for using node predicates on a deferred suffix tree.
Given a sequences, we want to find all substrings s that fulfill certain constraints. The relative probabilty to see s should be at least p_min. s should also be not longer than replen_max. The latter constraint is a anti-monotonic pattern predicate and can be used in conjunction with the first constraint to cut of the trunk of a suffix tree. Only the top of the suffix tree contains candidates that might fulfill both predicates, so we can use an Index based on a deferred suffix tree (see IndexWotd). The following example demonstrates how to iterate over all suffix tree nodes fulfilling the constraints and output them.
A tutorial showing how to extent an index with a node predicate.
1#include <iostream>
2#include <seqan/index.h>
3
4using namespace seqan;
5
constraint parameters
6struct TMyConstraints {
7    double p_min;
8    unsigned int replen_max;
9    bool _cachedPred;
10};
11
SeqAn extensions
12namespace seqan 
13{
14    // custom TSpec for our customized wotd-Index
15    struct TMyIndex;
16
17    template <typename TText>
18    struct Cargo<Index<TText, IndexWotd<TMyIndex> > > {
19        typedef TMyConstraints Type;
20    };
21
22    // node predicate
23    template <typename TText, typename TSpec>
24    bool nodePredicate(Iter<Index<TText, IndexWotd<TMyIndex> >, TSpec> &it) 
25    {
26        TMyConstraints &cons = cargo(container(it));
27        unsigned int delta = countSequences(container(it)) * repLength(it);
28        unsigned int textLen = length(container(it));
29
30        if (textLen <= delta) return cons._cachedPred = true;
31        return cons._cachedPred = 
32            ((double)countOccurrences(it) / (double)(textLen - delta)) > cons.p_min;
33    }
34
35    // monotonic hull
36    template <typename TText, typename TSpec>
37    bool nodeHullPredicate(Iter<Index<TText, IndexWotd<TMyIndex> >, TSpec> &it) 
38    {
39        TMyConstraints const &cons = cargo(container(it));
40        unsigned int textLen = length(container(it));
41
42        if (repLength(it) > cons.replen_max) return false;
43
44        unsigned int delta = countSequences(container(it)) * cons.replen_max;
45        if (textLen <= delta) return true;
46        return ((double)countOccurrences(it) / 
47                (double)(textLen - delta)) > cons.p_min;
48    }
49}
50
51int main ()
52{
We begin with a String to store our sequence.
53    String<char> myString = "How many wood would a woodchuck chuck.";
54
Then we create our customized index which is a specialization of the deferred wotd-Index
55    typedef Index< String<char>, IndexWotd<TMyIndex> > TMyIndex;
56    TMyIndex myIndex(myString);
57
58    cargo(myIndex).replen_max = 10;
59    cargo(myIndex).p_min = 0.05;
60
To find all strings that fulfill our constraints, we simply do a dfs-traversal via goBegin and goNext.
61    typedef Iterator< TMyIndex, TopDown<ParentLinks<> > >::Type TConstrainedIterator;
62    TConstrainedIterator myConstrainedIterator(myIndex);
63
64    goBegin(myConstrainedIterator);
65    while (!atEnd(myConstrainedIterator))
66    {
67
countOccurrences returns the number of hits of the representative.
68        std::cout << countOccurrences(myConstrainedIterator) << "x  ";
69
The representative string can be determined with representative
70        std::cout << "\t\"" << representative(myConstrainedIterator) << '\"' << std::endl;
71
72        goNext(myConstrainedIterator);
73    }
74
75    return 0;
76}
Output
weese@tanne:~/seqan$ cd demos
weese@tanne:~/seqan/demos$ make index_node_predicate
weese@tanne:~/seqan/demos$ ./index_node_predicate
38x     ""
6x      " "
3x      " wo"
2x      " wood"
2x      "a"
4x      "c"
2x      "chuck"
2x      "ck"
3x      "d"
2x      "d "
2x      "huck"
2x      "k"
6x      "o"
2x      "od"
2x      "ood"
3x      "u"
2x      "uck"
4x      "w"
3x      "wo"
2x      "wood"
weese@tanne:~/seqan/demos$
SeqAn - Sequence Analysis Library - www.seqan.de
 

Page built @2013/07/11 09:12:16