Raptor
A fast and space-efficient pre-filter
All Classes Namespaces Files Functions Variables Macros Pages Concepts
cutoff.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <seqan3/io/sequence_file/format_fasta.hpp>
13
15
16namespace raptor
17{
18
19class cutoff
20{
21public:
22 cutoff() = default;
23 cutoff(cutoff const &) = default;
24 cutoff & operator=(cutoff const &) = default;
25 cutoff(cutoff &&) = default;
26 cutoff & operator=(cutoff &&) = default;
27 ~cutoff() = default;
28
29 cutoff(prepare_arguments const & arguments) : fixed_cutoff{arguments.kmer_count_cutoff}
30 {
31 if (arguments.use_filesize_dependent_cutoff)
32 {
33 cutoff_kind = cutoff_kinds::filesize_dependent;
34 }
35 else
36 {
37 cutoff_kind = cutoff_kinds::fixed;
38 }
39 }
40
41 uint8_t get(std::filesystem::path const & filename) const noexcept
42 {
43 switch (cutoff_kind)
44 {
45 case cutoff_kinds::filesize_dependent:
46 return impl(filename);
47 default:
48 {
49 assert(cutoff_kind == cutoff_kinds::fixed);
50 return fixed_cutoff;
51 }
52 }
53 }
54
55 static inline bool file_is_compressed(std::filesystem::path const & filepath)
56 {
57 std::filesystem::path const extension = filepath.extension();
58 return extension == ".gz" || extension == ".bgzf" || extension == ".bz2";
59 }
60
61private:
62 enum class cutoff_kinds
63 {
64 fixed,
65 filesize_dependent
66 };
67
68 cutoff_kinds cutoff_kind{cutoff_kinds::fixed};
69
70 uint8_t fixed_cutoff{};
71 // Cutoffs and bounds from Mantis
72 // Mantis ignores k-mers which appear less than a certain cutoff. The cutoff is based on the file size of a
73 // gzipped fastq file. Small files have only a cutoff of 1 while big files have a cutoff value of 50.
74 // https://doi.org/10.1016/j.cels.2018.05.021
75 // Supplement Table S1
76 // https://www.cell.com/cms/10.1016/j.cels.2018.05.021/attachment/0a3d402b-8b90-42c0-a709-22f246fd1759/mmc1.pdf
77 static constexpr std::array<uint8_t, 4> const cutoffs{1u, 3u, 10u, 20u};
78 static constexpr std::array<uint64_t, 4> const cutoff_bounds{314'572'800ULL,
79 524'288'000ULL,
80 1'073'741'824ULL,
81 3'221'225'472ULL};
82
83 uint8_t impl(std::filesystem::path const & filename) const
84 {
85 bool const is_compressed = file_is_compressed(filename);
86 bool const is_fasta = check_for_fasta_format(filename);
87
88 // Since the curoffs are based on the filesize of a gzipped fastq file, we try account for the other cases:
89 // We multiply by two if we have fasta input.
90 // We divide by 3 if the input is not compressed.
91 size_t const filesize = std::filesystem::file_size(filename) * (is_fasta ? 2 : 1) / (is_compressed ? 1 : 3);
92
93 uint8_t cutoff{50u};
94 for (size_t i = 0; i < cutoff_bounds.size(); ++i)
95 {
96 if (filesize <= cutoff_bounds[i])
97 {
98 cutoff = cutoffs[i];
99 break;
100 }
101 }
102
103 return cutoff;
104 }
105
106 static inline bool
107 check_for_fasta_format(std::filesystem::path const & filepath,
108 std::vector<std::string> const & valid_extensions = seqan3::format_fasta::file_extensions)
109 {
110 std::string const extension = file_is_compressed(filepath) ? filepath.stem() : filepath.extension();
111
112 auto case_insensitive_string_ends_with = [&](std::string_view str, std::string_view suffix)
113 {
114 size_t const suffix_length{suffix.size()};
115 size_t const str_length{str.size()};
116 return suffix_length > str_length ? false
117 : std::ranges::equal(str.substr(str_length - suffix_length),
118 suffix,
119 [](char const chr1, char const chr2)
120 {
121 return std::tolower(chr1) == std::tolower(chr2);
122 });
123 };
124
125 auto case_insensitive_ends_with = [&](std::string const & ext)
126 {
127 return case_insensitive_string_ends_with(extension, ext);
128 };
129
130 return std::ranges::find_if(valid_extensions, case_insensitive_ends_with) != valid_extensions.end();
131 }
132};
133
134} // namespace raptor
Definition cutoff.hpp:20
T equal(T... args)
T extension(T... args)
T file_size(T... args)
T find_if(T... args)
Provides raptor::prepare_arguments.
T size(T... args)
Definition prepare_arguments.hpp:26
T substr(T... args)
Hide me