Raptor
A fast and space-efficient pre-filter
All Classes Namespaces Files Functions Variables Macros Pages Concepts
forward_strand_minimiser.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <deque>
13
14#include <seqan3/alphabet/nucleotide/dna4.hpp>
15#include <seqan3/search/views/kmer_hash.hpp>
16
19
20namespace raptor::threshold
21{
22
23// Minimiser without looking at reverse complement
25{
26private:
28 uint64_t window_size{};
30 seqan3::shape shape{};
32 uint8_t shape_size{};
34 uint64_t seed{};
37
38public:
41
48
53 forward_strand_minimiser(window const window_size_, seqan3::shape const shape_) :
54 window_size{window_size_.v},
55 shape{shape_},
56 shape_size{shape.size()},
57 seed{adjust_seed(shape.count())}
58 {
59 assert(window_size >= shape_size);
60 }
61
66 void resize(window const window_size_, seqan3::shape const shape_)
67 {
68 window_size = window_size_.v;
69 shape = shape_;
70 shape_size = shape.size();
71 seed = adjust_seed(shape.count());
72 assert(window_size >= shape_size);
73 }
74
75 void compute(std::vector<seqan3::dna4> const & text)
76 {
77 assert(window_size && shape_size && seed); // Forgot to initialise/resize?
78
79 size_t const text_length = text.size();
80 assert(shape_size <= text_length);
81 assert(window_size <= text_length);
82
83 uint64_t const max_number_of_minimiser = text_length - window_size + 1u;
84 uint64_t const kmers_per_window = window_size - shape_size + 1u;
85
87 minimiser_begin.reserve(max_number_of_minimiser);
88
89 // Compute all k-mer hashes.
90 auto apply_xor = [this](uint64_t const value)
91 {
92 return value ^ seed;
93 };
94 auto kmer_view = text | seqan3::views::kmer_hash(shape) | std::views::transform(apply_xor);
95 forward_hashes.assign(kmer_view.begin(), kmer_view.end());
96
97 struct kmer
98 {
99 uint64_t hash{};
100 uint64_t position{};
101
102 constexpr auto operator<=>(kmer const & other) const = default;
103 };
104
105 // Stores hash and begin for all k-mers in the window
106 std::deque<kmer> window_hashes;
107
108 // Initialisation. We need to compute all hashes for the first window.
109 for (uint64_t i = 0; i < kmers_per_window; ++i)
110 window_hashes.emplace_back(kmer{.hash = forward_hashes[i], .position = i});
111
112 // The minimum hash is the minimiser. Store the begin position.
113 auto min = std::ranges::min(window_hashes);
114 minimiser_begin.push_back(min.position);
115
116 // For the following windows, we remove the first window k-mer (is now not in window) and add the new k-mer
117 // that results from the window shifting.
118 for (uint64_t i = kmers_per_window; i < max_number_of_minimiser; ++i)
119 {
120 uint64_t const new_hash{forward_hashes[i + kmers_per_window - 1]}; // Already did kmers_per_window - 1 many
121
122 // There are two conditions when we need to recompute the minimum:
123 bool const minimiser_leaves_window = window_hashes.front() == min;
124 bool const new_hash_is_min = new_hash < min.hash;
125
126 // Rolling hash / sliding window
127 window_hashes.pop_front();
128 window_hashes.emplace_back(kmer{.hash = new_hash, .position = i});
129
130 // Update the minimum
131 if (new_hash_is_min || minimiser_leaves_window)
132 {
133 min = new_hash_is_min ? window_hashes.back() : std::ranges::min(window_hashes);
134 minimiser_begin.push_back(min.position);
135 }
136 }
137 return;
138 }
139};
140
141} // namespace raptor::threshold
Provides raptor::adjust_seed.
T assign(T... args)
T back(T... args)
T clear(T... args)
T emplace_back(T... args)
T front(T... args)
T min(T... args)
T pop_front(T... args)
T push_back(T... args)
T reserve(T... args)
T size(T... args)
Provides raptor::window.
Definition forward_strand_minimiser.hpp:25
uint64_t window_size
The window size of the minimiser.
Definition forward_strand_minimiser.hpp:28
forward_strand_minimiser(window const window_size_, seqan3::shape const shape_)
Constructs a minimiser from given k-mer, window size and a seed.
Definition forward_strand_minimiser.hpp:53
std::vector< uint64_t > forward_hashes
Stores the k-mer hashes of the forward strand.
Definition forward_strand_minimiser.hpp:36
forward_strand_minimiser(forward_strand_minimiser &&)=default
Defaulted.
forward_strand_minimiser & operator=(forward_strand_minimiser const &)=default
Defaulted.
uint64_t seed
Random but fixed value to xor k-mers with. Counteracts consecutive minimisers.
Definition forward_strand_minimiser.hpp:34
forward_strand_minimiser & operator=(forward_strand_minimiser &&)=default
Defaulted.
void resize(window const window_size_, seqan3::shape const shape_)
Resize the minimiser.
Definition forward_strand_minimiser.hpp:66
uint8_t shape_size
The size of the shape.
Definition forward_strand_minimiser.hpp:32
std::vector< uint64_t > minimiser_begin
Stores the begin positions of the minimisers.
Definition forward_strand_minimiser.hpp:40
seqan3::shape shape
The shape to use.
Definition forward_strand_minimiser.hpp:30
forward_strand_minimiser(forward_strand_minimiser const &)=default
Defaulted.
Strong type for passing the window size.
Definition strong_types.hpp:19
Hide me