Page Mummy

Simple MUMmer clone.

MUMmer is a tool to search for multiple exact matches (MUMs) between 2 given sequences. MUMs can be used as a starting point for a multiple genome alignment algorithm. This example shows how to implement a simple version of MUMer to find multiple exact matches of n sequences (n>=2) in SeqAn.

#include <iostream>
#include <fstream>
#include <seqan/index.h>

using namespace std;
using namespace seqan;


template <typename TIndex>
void findMUMs(TIndex &esa, unsigned minLen)
{
	typename Iterator<TIndex, Mums>::Type it(esa, minLen);  // set minimum MUM length
	String< typename SAValue<TIndex>::Type > occs;          // temp. string storing the hit positions

	cout << resetiosflags(ios::left);
	while (!atEnd(it)) 
	{
		occs = getOccurrences(it);                          // gives hit positions (seqNo,seqOfs)
		orderOccurrences(occs);                             // order them by seqNo
		
		for(unsigned i = 0; i < length(occs); ++i)
			cout << setw(8)
					<< getValueI2(occs[i]) + 1              // output them in MUMmer's output format
					<< "  ";

		cout << setw(8) 
				<< repLength(it)
				<< endl;

		++it;
	}
	cout << setiosflags(ios::left) << endl;
}


template <typename TSpec>
int runMummy(int argc, const char *argv[], unsigned seqCount, unsigned minLen)
{
	typedef String<Dna5, TSpec> TString;
	typedef StringSet<TString>  TStringSet;
	typedef Index<TStringSet>   TIndex;

	TIndex	index;

	// import sequences
	resize(indexText(index), seqCount);
	for(int arg = 1, seq = 0; arg < argc; ++arg) 
	{
		// skip two argument options
		if (strcmp(argv[arg], "-p") == 0 || strcmp(argv[arg], "--profile") == 0 ||
			strcmp(argv[arg], "-l") == 0 || strcmp(argv[arg], "--minlen") == 0) 
		{
			++arg;
			continue;
		}

		if (argv[arg][0] != '-') {
			ifstream file;
			file.open(argv[arg], ios_base::in | ios_base::binary);
			if (!file.is_open()) {
				cout << "Import of sequence " << argv[arg] << " failed." << endl;
				return 1;
			}
			read(file, indexText(index)[seq], Fasta());
			file.close();
			++seq;
		}
	}
	cout << lengthSum(indexText(index)) << " bps sequence imported." << endl;

	findMUMs(index, minLen);

	return 0;
}


void printHelp(int, const char *[], bool longHelp = false) 
{
	cout << "***************************************" << endl;
	cout << "***        Simple MUM finder        ***" << endl;
	cout << "*** written by David Weese (c) 2007 ***" << endl;
	cout << "***************************************" << endl << endl;
	cout << "Usage: mummy [OPTION]... <SEQUENCE FILE> ... <SEQUENCE FILE>" << endl;
	if (longHelp) {
		cout << endl << "Options:" << endl;
		cout << "  -e, --extern          \t" << "use external memory (for large datasets)" << endl;
		cout << "  -l, --minlen          \t" << "set minimum MUM length" << endl;
		cout << "                        \t" << "if not set, default value is 20" << endl;
		cout << "  -h, --help            \t" << "print this help" << endl;
	} else {
		cout << "Try 'mummy --help' for more information." << endl;
	}
}


int main(int argc, const char *argv[])
{
	if (argc < 2) {
		printHelp(argc, argv);
		return 0;
	}

	unsigned optMinLen = 20;
	bool	 optExtern = false;

	unsigned seqCount = 0;
	for(int arg = 1; arg < argc; ++arg) {
		if (argv[arg][0] == '-') {
			// parse option

			if (strcmp(argv[arg], "-e") == 0 || strcmp(argv[arg], "--extern") == 0) {
				// use external memory algorithms
				optExtern = true;
				continue;
			}
			if (strcmp(argv[arg], "-l") == 0 || strcmp(argv[arg], "--minlen") == 0) {
				// minimum match length
				if (arg + 1 == argc) {
					printHelp(argc, argv);
					return 0;
				}
				++arg;
				optMinLen = atoi(argv[arg]);
				continue;
			}
			if (strcmp(argv[arg], "-h") == 0 || strcmp(argv[arg], "--help") == 0) {
				// print help
				printHelp(argc, argv, true);
				return 0;
			}
		} else {
			// parse sequence file
			++seqCount;
		}
	}

	if (optExtern)
		return runMummy<External<> >(argc, argv, seqCount, optMinLen);
	else
		return runMummy<Alloc<> >(argc, argv, seqCount, optMinLen);
}

If you run the tool on 2 sequences it outputs exactly the same matches as MUMmer (called with -mum option), it only differs in the * order of outputted matches. To get matches with increasing positions at the first sequence we piped the output to sort.

As an example data set we used 3 strains of chlamydia bacterium (NC_002620.fna, NC_000117.fna, NC_007429.fna) and saved the Fasta files to the demos directory.

weese@tanne:~/seqan$ cd demos
weese@tanne:~/seqan/demos$ make index_mummy
weese@tanne:~/seqan/demos$ ./index_mummy -h
***************************************
***        Simple MUM finder        ***
*** written by David Weese (c) 2007 ***
***************************************

Usage: mummy [OPTION]... <SEQUENCE FILE> ... <SEQUENCE FILE>

Options:
  -e, --extern                  use external memory (for large datasets)
  -l, --minlen                  set minimum MUM length
                                if not set, default value is 20
  -h, --help                    print this help
weese@tanne:~/seqan/demos$ ./index_mummy NC*.fna |sort > mums.txt
3159928 bps sequence imported.
weese@tanne:~/seqan/demos$ head mums.txt
    1565    323805      2159        48
    1646    323886      2240        27
    1722    323962      2316        37
    1774    324014      2368        26
    1941    324181      2535        23
    2061    324301      2655        35
    2102    324342      2696        29
    2132    324372      2726        20
    2183    324423      2777        24
weese@tanne:~/seqan/demos$

See Also