Page Mummy
Simple MUMmer clone.
MUMmer is a tool to search for multiple exact matches (MUMs) between 2 given sequences. MUMs can be used as a starting point for a multiple genome alignment algorithm. This example shows how to implement a simple version of MUMer to find multiple exact matches of n sequences (n>=2) in SeqAn.
#include <iostream>
#include <fstream>
#include <seqan/index.h>
#include <seqan/seq_io.h>
using namespace seqan;
template <typename TIndex>
void findMUMs(TIndex & esa, unsigned minLen)
{
typename Iterator<TIndex, Mums>::Type it(esa, minLen); // set minimum MUM length
String<typename SAValue<TIndex>::Type> occs; // temp. string storing the hit positions
std::cout << std::resetiosflags(std::ios::left);
while (!atEnd(it))
{
occs = getOccurrences(it); // gives hit positions (seqNo,seqOfs)
orderOccurrences(occs); // order them by seqNo
for (unsigned i = 0; i < length(occs); ++i)
std::cout << std::setw(8)
<< getValueI2(occs[i]) + 1 // output them in MUMmer's output format
<< " ";
std::cout << std::setw(8)
<< repLength(it)
<< "\n";
++it;
}
std::cout << std::setiosflags(std::ios::left) << "\n";
}
template <typename TSpec>
int runMummy(int argc, const char * argv[], unsigned seqCount, unsigned minLen)
{
typedef String<Dna5, TSpec> TString;
typedef StringSet<TString> TStringSet;
typedef Index<TStringSet> TIndex;
TIndex index;
// import sequences
StringSet<CharString> meta;
for (int arg = 1, seq = 0; arg < argc; ++arg)
{
// skip two argument options
if (strcmp(argv[arg], "-p") == 0 || strcmp(argv[arg], "--profile") == 0 ||
strcmp(argv[arg], "-l") == 0 || strcmp(argv[arg], "--minlen") == 0)
{
++arg;
continue;
}
if (argv[arg][0] != '-')
{
SeqFileIn file;
if (!open(file, argv[arg]))
{
std::cout << "Import of sequence " << argv[arg] << " failed.\n";
return 1;
}
readRecords(meta, indexText(index), file);
clear(meta);
close(file);
++seq;
}
}
std::cout << lengthSum(indexText(index)) << " bps sequence imported.\n";
findMUMs(index, minLen);
return 0;
}
void printHelp(int, const char *[], bool longHelp = false)
{
std::cout << "***************************************\n";
std::cout << "*** Simple MUM finder ***\n";
std::cout << "*** written by David Weese (c) 2007 ***\n";
std::cout << "***************************************\n\n";
std::cout << "Usage: mummy [OPTION]... <SEQUENCE FILE> ... <SEQUENCE FILE>\n";
if (longHelp)
{
std::cout << "\nOptions:\n";
std::cout << " -e, --extern \tuse external memory (for large datasets)\n";
std::cout << " -l, --minlen \tset minimum MUM length\n";
std::cout << " \tif not set, default value is 20\n";
std::cout << " -h, --help \tprint this help\n";
}
else
{
std::cout << "Try 'mummy --help' for more information.\n";
}
}
int main(int argc, const char * argv[])
{
if (argc < 2)
{
printHelp(argc, argv);
return 0;
}
unsigned optMinLen = 20;
bool optExtern = false;
unsigned seqCount = 0;
for (int arg = 1; arg < argc; ++arg)
{
if (argv[arg][0] == '-')
{
// parse option
if (strcmp(argv[arg], "-e") == 0 || strcmp(argv[arg], "--extern") == 0)
{
// use external memory algorithms
optExtern = true;
continue;
}
if (strcmp(argv[arg], "-l") == 0 || strcmp(argv[arg], "--minlen") == 0)
{
// minimum match length
if (arg + 1 == argc)
{
printHelp(argc, argv);
return 0;
}
++arg;
optMinLen = atoi(argv[arg]);
continue;
}
if (strcmp(argv[arg], "-h") == 0 || strcmp(argv[arg], "--help") == 0)
{
// print help
printHelp(argc, argv, true);
return 0;
}
}
else
{
// parse sequence file
++seqCount;
}
}
if (optExtern)
return runMummy<External<> >(argc, argv, seqCount, optMinLen);
else
return runMummy<Alloc<> >(argc, argv, seqCount, optMinLen);
}
If you run the tool on 2 sequences it outputs exactly the same matches as MUMmer (called with -mum option), it only differs in the * order of outputted matches. To get matches with increasing positions at the first sequence we piped the output to sort.
As an example data set we used 3 strains of chlamydia bacterium (NC_002620.fna, NC_000117.fna, NC_007429.fna) and saved the Fasta files to the demos directory.
weese@tanne:~/seqan$ cd demos weese@tanne:~/seqan/demos$ make index_mummy weese@tanne:~/seqan/demos$ ./index_mummy -h *************************************** *** Simple MUM finder *** *** written by David Weese (c) 2007 *** *************************************** Usage: mummy [OPTION]... <SEQUENCE FILE> ... <SEQUENCE FILE> Options: -e, --extern use external memory (for large datasets) -l, --minlen set minimum MUM length if not set, default value is 20 -h, --help print this help weese@tanne:~/seqan/demos$ ./index_mummy NC*.fna |sort > mums.txt 3159928 bps sequence imported. weese@tanne:~/seqan/demos$ head mums.txt 1565 323805 2159 48 1646 323886 2240 27 1722 323962 2316 37 1774 324014 2368 26 1941 324181 2535 23 2061 324301 2655 35 2102 324342 2696 29 2132 324372 2726 20 2183 324423 2777 24 weese@tanne:~/seqan/demos$