Page Mummy

Simple MUMmer clone.

MUMmer is a tool to search for multiple exact matches (MUMs) between 2 given sequences. MUMs can be used as a starting point for a multiple genome alignment algorithm. This example shows how to implement a simple version of MUMer to find multiple exact matches of n sequences (n>=2) in SeqAn.

#include <iostream>
#include <fstream>
#include <seqan/index.h>
#include <seqan/seq_io.h>

using namespace seqan;


template <typename TIndex>
void findMUMs(TIndex & esa, unsigned minLen)
{
    typename Iterator<TIndex, Mums>::Type it(esa, minLen);  // set minimum MUM length
    String<typename SAValue<TIndex>::Type> occs;            // temp. string storing the hit positions

    std::cout << std::resetiosflags(std::ios::left);
    while (!atEnd(it))
    {
        occs = getOccurrences(it);                          // gives hit positions (seqNo,seqOfs)
        orderOccurrences(occs);                             // order them by seqNo

        for (unsigned i = 0; i < length(occs); ++i)
            std::cout << std::setw(8)
                      << getValueI2(occs[i]) + 1            // output them in MUMmer's output format
                      << "  ";

        std::cout << std::setw(8)
                  << repLength(it)
                  << "\n";

        ++it;
    }
    std::cout << std::setiosflags(std::ios::left) << "\n";
}

template <typename TSpec>
int runMummy(int argc, const char * argv[], unsigned seqCount, unsigned minLen)
{
    typedef String<Dna5, TSpec> TString;
    typedef StringSet<TString>  TStringSet;
    typedef Index<TStringSet>   TIndex;

    TIndex  index;

    // import sequences
    StringSet<CharString> meta;
    for (int arg = 1, seq = 0; arg < argc; ++arg)
    {
        // skip two argument options
        if (strcmp(argv[arg], "-p") == 0 || strcmp(argv[arg], "--profile") == 0 ||
            strcmp(argv[arg], "-l") == 0 || strcmp(argv[arg], "--minlen") == 0)
        {
            ++arg;
            continue;
        }

        if (argv[arg][0] != '-')
        {
            SeqFileIn file;
            if (!open(file, argv[arg]))
            {
                std::cout << "Import of sequence " << argv[arg] << " failed.\n";
                return 1;
            }
            readRecords(meta, indexText(index), file);
            clear(meta);
            close(file);
            ++seq;
        }
    }
    std::cout << lengthSum(indexText(index)) << " bps sequence imported.\n";

    findMUMs(index, minLen);

    return 0;
}

void printHelp(int, const char *[], bool longHelp = false)
{
    std::cout << "***************************************\n";
    std::cout << "***        Simple MUM finder        ***\n";
    std::cout << "*** written by David Weese (c) 2007 ***\n";
    std::cout << "***************************************\n\n";
    std::cout << "Usage: mummy [OPTION]... <SEQUENCE FILE> ... <SEQUENCE FILE>\n";
    if (longHelp)
    {
        std::cout << "\nOptions:\n";
        std::cout << "  -e, --extern          \tuse external memory (for large datasets)\n";
        std::cout << "  -l, --minlen          \tset minimum MUM length\n";
        std::cout << "                        \tif not set, default value is 20\n";
        std::cout << "  -h, --help            \tprint this help\n";
    }
    else
    {
        std::cout << "Try 'mummy --help' for more information.\n";
    }
}

int main(int argc, const char * argv[])
{
    if (argc < 2)
    {
        printHelp(argc, argv);
        return 0;
    }

    unsigned optMinLen = 20;
    bool     optExtern = false;

    unsigned seqCount = 0;
    for (int arg = 1; arg < argc; ++arg)
    {
        if (argv[arg][0] == '-')
        {
            // parse option

            if (strcmp(argv[arg], "-e") == 0 || strcmp(argv[arg], "--extern") == 0)
            {
                // use external memory algorithms
                optExtern = true;
                continue;
            }
            if (strcmp(argv[arg], "-l") == 0 || strcmp(argv[arg], "--minlen") == 0)
            {
                // minimum match length
                if (arg + 1 == argc)
                {
                    printHelp(argc, argv);
                    return 0;
                }
                ++arg;
                optMinLen = atoi(argv[arg]);
                continue;
            }
            if (strcmp(argv[arg], "-h") == 0 || strcmp(argv[arg], "--help") == 0)
            {
                // print help
                printHelp(argc, argv, true);
                return 0;
            }
        }
        else
        {
            // parse sequence file
            ++seqCount;
        }
    }

    if (optExtern)
        return runMummy<External<> >(argc, argv, seqCount, optMinLen);
    else
        return runMummy<Alloc<> >(argc, argv, seqCount, optMinLen);
}

If you run the tool on 2 sequences it outputs exactly the same matches as MUMmer (called with -mum option), it only differs in the * order of outputted matches. To get matches with increasing positions at the first sequence we piped the output to sort.

As an example data set we used 3 strains of chlamydia bacterium (NC_002620.fna, NC_000117.fna, NC_007429.fna) and saved the Fasta files to the demos directory.

weese@tanne:~/seqan$ cd demos
weese@tanne:~/seqan/demos$ make index_mummy
weese@tanne:~/seqan/demos$ ./index_mummy -h
***************************************
***        Simple MUM finder        ***
*** written by David Weese (c) 2007 ***
***************************************

Usage: mummy [OPTION]... <SEQUENCE FILE> ... <SEQUENCE FILE>

Options:
  -e, --extern                  use external memory (for large datasets)
  -l, --minlen                  set minimum MUM length
                                if not set, default value is 20
  -h, --help                    print this help
weese@tanne:~/seqan/demos$ ./index_mummy NC*.fna |sort > mums.txt
3159928 bps sequence imported.
weese@tanne:~/seqan/demos$ head mums.txt
    1565    323805      2159        48
    1646    323886      2240        27
    1722    323962      2316        37
    1774    324014      2368        26
    1941    324181      2535        23
    2061    324301      2655        35
    2102    324342      2696        29
    2132    324372      2726        20
    2183    324423      2777        24
weese@tanne:~/seqan/demos$

See Also