Example Program
Mummy
Simple MUMmer clone.
MUMmer is a tool to search for multiple exact matches (MUMs) between 2 given sequences. MUMs can be used as a starting point for a multiple genome alignment algorithm. This example shows how to implement a simple version of MUMer to find multiple exact matches of n sequences (n≥2) in SeqAn.
1#include <iostream>
2#include <fstream>
3#include <seqan/index.h>
4
5using namespace std;
6using namespace seqan;
7
8
9template <typename TIndex>
10void findMUMs(TIndex &esa, unsigned minLen)
11{
12    typename Iterator<TIndex, MUMs>::Type it(esa, minLen);  // set minimum MUM length
13    String< typename SAValue<TIndex>::Type > occs;          // temp. string storing the hit positions
14
15    cout << resetiosflags(ios::left);
16    while (!atEnd(it)) 
17    {
18        occs = getOccurrences(it);                          // gives hit positions (seqNo,seqOfs)
19        orderOccurrences(occs);                             // order them by seqNo
20        
21        for(unsigned i = 0; i < length(occs); ++i)
22            cout << setw(8)
23                    << getValueI2(occs[i]) + 1              // output them in MUMmer's output format
24                    << "  ";
25
26        cout << setw(8) 
27                << repLength(it)
28                << endl;
29
30        ++it;
31    }
32    cout << setiosflags(ios::left) << endl;
33}
34
35
36template <typename TSpec>
37int runMummy(int argc, const char *argv[], unsigned seqCount, unsigned minLen)
38{
39    typedef String<Dna5, TSpec> TString;
40    typedef StringSet<TString>  TStringSet;
41    typedef Index<TStringSet>   TIndex;
42
43    TIndex    index;
44
45    // import sequences
46    resize(indexText(index), seqCount);
47    for(int arg = 1, seq = 0; arg < argc; ++arg) 
48    {
49        // skip two argument options
50        if (strcmp(argv[arg], "-p") == 0 || strcmp(argv[arg], "--profile") == 0 ||
51            strcmp(argv[arg], "-l") == 0 || strcmp(argv[arg], "--minlen") == 0) 
52        {
53            ++arg;
54            continue;
55        }
56
57        if (argv[arg][0] != '-') {
58            ifstream file;
59            file.open(argv[arg], ios_base::in | ios_base::binary);
60            if (!file.is_open()) {
61                cerr << "Import of sequence " << argv[arg] << " failed." << endl;
62                return 1;
63            }
64            read(file, indexText(index)[seq], Fasta());
65            file.close();
66            ++seq;
67        }
68    }
69    cerr << lengthSum(indexText(index)) << " bps sequence imported." << endl;
70
71    findMUMs(index, minLen);
72
73    return 0;
74}
75
76
77void printHelp(int, const char *[], bool longHelp = false) 
78{
79    cerr << "***************************************" << endl;
80    cerr << "***        Simple MUM finder        ***" << endl;
81    cerr << "*** written by David Weese (c) 2007 ***" << endl;
82    cerr << "***************************************" << endl << endl;
83    cerr << "Usage: mummy [OPTION]... <SEQUENCE FILE> ... <SEQUENCE FILE>" << endl;
84    if (longHelp) {
85        cerr << endl << "Options:" << endl;
86        cerr << "  -e, --extern          \t" << "use external memory (for large datasets)" << endl;
87        cerr << "  -l, --minlen          \t" << "set minimum MUM length" << endl;
88        cerr << "                        \t" << "if not set, default value is 20" << endl;
89        cerr << "  -h, --help            \t" << "print this help" << endl;
90    } else {
91        cerr << "Try 'mummy --help' for more information." << endl;
92    }
93}
94
95
96int main(int argc, const char *argv[])
97{
98    if (argc < 2) {
99        printHelp(argc, argv);
100        return 0;
101    }
102
103    unsigned optMinLen = 20;
104    bool     optExtern = false;
105
106    unsigned seqCount = 0;
107    for(int arg = 1; arg < argc; ++arg) {
108        if (argv[arg][0] == '-') {
109            // parse option
110
111            if (strcmp(argv[arg], "-e") == 0 || strcmp(argv[arg], "--extern") == 0) {
112                // use external memory algorithms
113                optExtern = true;
114                continue;
115            }
116            if (strcmp(argv[arg], "-l") == 0 || strcmp(argv[arg], "--minlen") == 0) {
117                // minimum match length
118                if (arg + 1 == argc) {
119                    printHelp(argc, argv);
120                    return 0;
121                }
122                ++arg;
123                optMinLen = atoi(argv[arg]);
124                continue;
125            }
126            if (strcmp(argv[arg], "-h") == 0 || strcmp(argv[arg], "--help") == 0) {
127                // print help
128                printHelp(argc, argv, true);
129                return 0;
130            }
131        } else {
132            // parse sequence file
133            ++seqCount;
134        }
135    }
136
137    if (optExtern)
138        return runMummy<External<> >(argc, argv, seqCount, optMinLen);
139    else
140        return runMummy<Alloc<> >(argc, argv, seqCount, optMinLen);
141}
Output
If you run the tool on 2 sequences it outputs exactly the same matches as MUMmer (called with -mum option), it only differs in the order of outputted matches. To get matches with increasing positions at the first sequence we piped the output to sort.
As an example data set we used 3 strains of chlamydia bacterium (NC_002620.fna, NC_000117.fna, NC_007429.fna) and saved the Fasta files to the demos directory.
weese@tanne:~/seqan$ cd demos
weese@tanne:~/seqan/demos$ make index_mummy
weese@tanne:~/seqan/demos$ ./index_mummy -h
***************************************
***        Simple MUM finder        ***
*** written by David Weese (c) 2007 ***
***************************************

Usage: mummy [OPTION]... <SEQUENCE FILE> ... <SEQUENCE FILE>

Options:
  -e, --extern                  use external memory (for large datasets)
  -l, --minlen                  set minimum MUM length
                                if not set, default value is 20
  -h, --help                    print this help
weese@tanne:~/seqan/demos$ ./index_mummy NC*.fna |sort > mums.txt
3159928 bps sequence imported.
weese@tanne:~/seqan/demos$ head mums.txt
    1565    323805      2159        48
    1646    323886      2240        27
    1722    323962      2316        37
    1774    324014      2368        26
    1941    324181      2535        23
    2061    324301      2655        35
    2102    324342      2696        29
    2132    324372      2726        20
    2183    324423      2777        24
weese@tanne:~/seqan/demos$
SeqAn - Sequence Analysis Library - www.seqan.de