SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
alignment_from_cigar.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <vector>
13
19
20namespace seqan3
21{
22
80template <typename reference_type, typename sequence_type>
81inline auto alignment_from_cigar(std::vector<cigar> const & cigar_vector,
82 reference_type const & reference,
83 uint32_t const zero_based_reference_start_position,
84 sequence_type const & query)
85{
86 if (cigar_vector.empty())
87 throw std::logic_error{"An empty CIGAR is not a valid alignment representation."};
88
89 // compute the length of the aligned region in the reference sequence
90 // -------------------------------------------------------------------------
91 // this requires a first stream over the cigar vector.
92 uint32_t reference_length{0};
93 uint32_t query_length{0};
94
95 for (auto [cigar_count, cigar_operation] : cigar_vector)
96 {
97 if (('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation)
98 || ('X'_cigar_operation == cigar_operation))
99 {
100 reference_length += cigar_count;
101 query_length += cigar_count;
102 }
103 else if ('D'_cigar_operation == cigar_operation)
104 {
105 reference_length += cigar_count;
106 }
107 else if ('I'_cigar_operation == cigar_operation)
108 {
109 query_length += cigar_count;
110 }
111 }
112
113 if (static_cast<size_t>(zero_based_reference_start_position + reference_length) > std::ranges::size(reference))
114 throw std::logic_error{"The CIGAR string indicates a reference length of at least "
115 + std::to_string(zero_based_reference_start_position + reference_length)
116 + ", but the supplied reference sequence is only of size"
117 + std::to_string(std::ranges::size(reference)) + "."};
118
119 // Get soft clipping at the start and end of the CIGAR string
120 // -------------------------------------------------------------------------
121 uint32_t soft_clipping_start{0};
122 uint32_t soft_clipping_end{0};
123
124 // Checks whether the given index in the cigar vector is a soft clip.
125 auto soft_clipping_at = [&cigar_vector](size_t const index)
126 {
127 return cigar_vector[index] == 'S'_cigar_operation;
128 };
129 // Checks whether the given index in the cigar vector is a hard clip.
130 auto hard_clipping_at = [&](size_t const index)
131 {
132 return cigar_vector[index] == 'H'_cigar_operation;
133 };
134 // Checks whether the given cigar vector has at least min_size many elements.
135 auto vector_size_at_least = [&](size_t const min_size)
136 {
137 return cigar_vector.size() >= min_size;
138 };
139 // Returns the cigar count of the ith cigar element in the given cigar vector.
140 auto cigar_count_at = [&](size_t const index)
141 {
142 return get<0>(cigar_vector[index]);
143 };
144
145 // check for soft clipping at the first two positions
146 // cigar is non-empty, checked at the very beginning.
147 if (soft_clipping_at(0))
148 soft_clipping_start = cigar_count_at(0);
149 else if (vector_size_at_least(2) && hard_clipping_at(0) && soft_clipping_at(1))
150 soft_clipping_start = cigar_count_at(1);
151
152 // Check for soft clipping at the last two positions to validate the CIGAR string.
153 // Even if the two following arithmetics overflow, they are protected by the corresponding if expressions below.
154 auto const last_index = cigar_vector.size() - 1;
155 auto const second_last_index = last_index - 1;
156
157 if (vector_size_at_least(2) && soft_clipping_at(last_index))
158 soft_clipping_end = cigar_count_at(last_index);
159 else if (vector_size_at_least(3) && hard_clipping_at(last_index) && soft_clipping_at(second_last_index))
160 soft_clipping_end = cigar_count_at(second_last_index);
161
162 if (soft_clipping_start + query_length + soft_clipping_end != std::ranges::size(query))
163 throw std::logic_error{"The CIGAR string indicates a query/read sequence length of "
164 + std::to_string(soft_clipping_start + query_length + soft_clipping_end)
165 + ", but the supplied query/read sequence is of size"
166 + std::to_string(std::ranges::size(query)) + "."};
167
168 // Assign the sequence to the alignment (a tuple of 2 gap decorators)
169 // -------------------------------------------------------------------------
170 using gapped_reference_type = gap_decorator<decltype(reference | views::slice(0, 0))>;
171 using gapped_sequence_type = gap_decorator<decltype(query | views::slice(0, 0))>;
173
174 alignment_type alignment{};
175
176 assign_unaligned(get<0>(alignment),
177 reference
178 | views::slice(zero_based_reference_start_position,
179 zero_based_reference_start_position + reference_length));
180 // query_length already accounts for soft clipping at begin and end
181 assign_unaligned(get<1>(alignment), query | views::slice(soft_clipping_start, soft_clipping_start + query_length));
182
183 // Insert gaps into the alignment based on the cigar vector
184 // -------------------------------------------------------------------------
185 using std::get;
186 auto current_ref_pos = std::ranges::begin(get<0>(alignment));
187 auto current_read_pos = std::ranges::begin(get<1>(alignment));
188
189 for (auto [cigar_count, cigar_operation] : cigar_vector)
190 {
191 if (('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation)
192 || ('X'_cigar_operation == cigar_operation))
193 {
194 std::ranges::advance(current_ref_pos, cigar_count);
195 std::ranges::advance(current_read_pos, cigar_count);
196 }
197 else if (('D'_cigar_operation == cigar_operation) || ('N'_cigar_operation == cigar_operation))
198 {
199 // insert gaps into query
200 current_read_pos = get<1>(alignment).insert_gap(current_read_pos, cigar_count);
201 ++current_read_pos;
202 std::ranges::advance(current_ref_pos, cigar_count);
203 }
204 else if (('I'_cigar_operation == cigar_operation)) // Insert gaps into ref
205 {
206 current_ref_pos = get<0>(alignment).insert_gap(current_ref_pos, cigar_count);
207 ++current_ref_pos;
208 std::ranges::advance(current_read_pos, cigar_count);
209 }
210 else if (('P'_cigar_operation == cigar_operation)) // skip padding
211 {
212 current_ref_pos = get<0>(alignment).insert_gap(current_ref_pos, cigar_count);
213 ++current_ref_pos;
214
215 current_read_pos = get<1>(alignment).insert_gap(current_read_pos, cigar_count);
216 ++current_read_pos;
217 }
218 // S and H are ignored as they are handled by cropping the sequence
219 }
220
221 return alignment;
222}
223
227template <typename reference_type, typename sequence_type>
228inline auto alignment_from_cigar(std::string const & cigar_string,
229 reference_type const & reference,
230 uint32_t const zero_based_reference_start_position,
231 sequence_type const & query)
232{
233 alignment_from_cigar(detail::parse_cigar(cigar_string), reference, zero_based_reference_start_position, query);
234}
235
236} // namespace seqan3
Includes the aligned_sequence and the related insert_gap and erase_gap functions to enable stl contai...
Provides the seqan3::cigar alphabet.
T begin(T... args)
A gap decorator allows the annotation of sequences with gap symbols while leaving the underlying sequ...
Definition gap_decorator.hpp:78
T empty(T... args)
Provides seqan3::gap_decorator.
auto alignment_from_cigar(std::vector< cigar > const &cigar_vector, reference_type const &reference, uint32_t const zero_based_reference_start_position, sequence_type const &query)
Construct an alignment from a CIGAR string and the corresponding sequences.
Definition alignment_from_cigar.hpp:81
@ alignment
The (pairwise) alignment stored in an object that models seqan3::detail::pairwise_alignment.
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition slice.hpp:175
Auxiliary functions for the SAM IO.
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
T size(T... args)
Provides seqan3::views::slice.
T to_string(T... args)
Hide me