SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
cigar_from_alignment.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <vector>
13
17
18namespace seqan3
19{
20
27{
28 uint32_t hard_front{};
29 uint32_t hard_back{};
30 uint32_t soft_front{};
31 uint32_t soft_back{};
32};
33
110template <typename alignment_type>
111inline auto cigar_from_alignment(alignment_type const & alignment,
112 cigar_clipped_bases const & clipped_bases = {},
113 bool const extended_cigar = false)
114{
116 && std::tuple_size_v<std::remove_cvref_t<alignment_type>> == 2),
117 "The alignment must be a std::pair or std::tuple of size 2.");
118
119 static_assert(
120 (std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(alignment))>>
121 && std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(alignment))>>),
122 "The alignment must contain two ranges whose value_type is comparable to seqan3::gap.");
123
124 /*brief: Compares two seqan3::aligned_sequence values and returns their cigar operation.
125 * param reference_char The aligned character of the reference to compare.
126 * param query_char The aligned character of the query to compare.
127 * param extended_cigar Whether to print the extended cigar alphabet or not. See cigar operation.
128 * returns A seqan3::cigar::operation representing the alignment operation between the two values.
129 *
130 * The resulting CIGAR operation is based on the query character (`query_char`).
131 *
132 * ### Example:
133 *
134 * The following alignment column shows the reference char ('C') on top and a
135 * gap for the query char at the bottom.
136 * ```
137 * ... C ...
138 * |
139 * ... - ...
140 * ```
141 * In this case, `to_cigar_op` will return
142 * 'D' since the query char is "deleted".
143 *
144 * The next alignment column shows the reference char ('C') on top and a
145 * query char ('G') at the bottom.
146 * ```
147 * ... C ...
148 * |
149 * ... G ...
150 * ```
151 * In this case, `to_cigar_op` will return
152 * 'M', for the basic CIGAR the two bases are aligned, while
153 * in the extended CIGAR alphabet (`extended_cigar` = `true`) the function
154 * will return an 'X' since the bases are aligned but are not
155 * equal.
156 * \sa seqan3::aligned_sequence
157 */
158 auto to_cigar_op = [extended_cigar](auto const reference_char, auto const query_char)
159 {
160 // note that N is not considered because it is equivalent to D but has a special meaning:
161 // SAM spec: "For mRNA-to-genome alignment, an N operation represents an intron. For other types of alignments,
162 // the interpretation of N is not defined."
163 // as we cannot know the meaning, the user has to change D -> N themself
164 constexpr std::array<char, 6> operators{'M', 'D', 'I', 'P', 'X', '='}; // contains the possible cigar operators.
165
166 // no gaps -> 00 -> 0
167 // gap only in query -> 01 -> 1
168 // gap only in reference -> 10 -> 2
169 // both gaps -> 11 -> 3
170 uint8_t key = (static_cast<uint8_t>(reference_char == gap{}) << 1) | static_cast<uint8_t>(query_char == gap{});
171
172 // mismatch -> 100 -> 4
173 // match -> 101 -> 5
174 if (extended_cigar && (key == 0)) // in extended format, refine the substitution operator to match/mismatch.
175 key |= ((1 << 2) | static_cast<uint8_t>(query_char == reference_char)); // maps to [4, 5].
176
177 return assign_char_to(operators[key], cigar::operation{});
178 };
179
180 using std::get;
181
182 auto & ref_seq = get<0>(alignment);
183 auto & query_seq = get<1>(alignment);
184
185 if (ref_seq.size() != query_seq.size())
186 throw std::logic_error{"The aligned sequences (including gaps) must have the same length."};
187
188 if (std::ranges::empty(ref_seq)) // only check ref_seq because query_seq was checked to have to same size
189 throw std::logic_error{"The aligned sequences may not be empty."};
190
191 std::vector<cigar> result{};
192
193 // Add (H)ard-clipping at the start of the query
194 if (clipped_bases.hard_front)
195 result.emplace_back(clipped_bases.hard_front, 'H'_cigar_operation);
196
197 // Add (S)oft-clipping at the start of the query
198 if (clipped_bases.soft_front)
199 result.emplace_back(clipped_bases.soft_front, 'S'_cigar_operation);
200
201 // Create cigar string from alignment
202 // -------------------------------------------------------------------------
203 // initialize first operation and count value:
204 cigar::operation operation{to_cigar_op(ref_seq[0], query_seq[0])};
205 uint32_t count{0};
206
207 // go through alignment columns
208 for (auto && [reference_char, query_char] : views::zip(ref_seq, query_seq))
209 {
210 cigar::operation next_op = to_cigar_op(reference_char, query_char);
211
212 if (operation == next_op)
213 {
214 ++count;
215 }
216 else
217 {
218 result.emplace_back(count, operation);
219 operation = next_op;
220 count = 1;
221 }
222 }
223
224 // append last cigar element
225 result.emplace_back(count, operation);
226
227 // Add (S)oft-clipping at the end of the query
228 if (clipped_bases.soft_back)
229 result.emplace_back(clipped_bases.soft_back, 'S'_cigar_operation);
230
231 // Add (H)ard-clipping at the end of the query
232 if (clipped_bases.hard_back)
233 result.emplace_back(clipped_bases.hard_back, 'H'_cigar_operation);
234
235 return result;
236}
237
238} // namespace seqan3
Provides the seqan3::cigar alphabet.
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition alphabet/cigar/cigar.hpp:93
T emplace_back(T... args)
Provides seqan3::gap.
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition alphabet/concept.hpp:521
auto cigar_from_alignment(alignment_type const &alignment, cigar_clipped_bases const &clipped_bases={}, bool const extended_cigar=false)
Creates a CIGAR string (SAM format) given a seqan3::detail::pairwise_alignment represented by two seq...
Definition cigar_from_alignment.hpp:111
@ ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
@ alignment
The (pairwise) alignment stored in an object that models seqan3::detail::pairwise_alignment.
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition type_pack/traits.hpp:161
seqan::stl::views::zip zip
A view adaptor that takes several views and returns tuple-like values from every i-th element of each...
Definition zip.hpp:24
Whether a type behaves like a tuple.
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
Helper struct to specialise soft and hard clipping when using seqan3::cigar_from_alignment.
Definition cigar_from_alignment.hpp:27
uint32_t hard_front
The number of hard clipped bases at the front of the CIGAR string.
Definition cigar_from_alignment.hpp:28
uint32_t soft_back
The number of soft clipped bases at the back of the CIGAR string.
Definition cigar_from_alignment.hpp:31
uint32_t hard_back
The number of hard clipped bases at the back of the CIGAR string.
Definition cigar_from_alignment.hpp:29
uint32_t soft_front
The number of soft clipped bases at the front of the CIGAR string.
Definition cigar_from_alignment.hpp:30
Provides seqan3::views::zip.
Hide me