diff --git a/CMakeLists.txt b/CMakeLists.txt
index 58e8054..c8bed87 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,14 +54,6 @@ MESSAGE(STATUS "Installation prefix: ${CMAKE_INSTALL_PREFIX}")
 MESSAGE(STATUS "Compiling for processor: ${CMAKE_SYSTEM_PROCESSOR}")
 MESSAGE(STATUS "Compiling with flags:${CMAKE_CXX_FLAGS}")
 
-include_directories(.) # all include paths relative to parent directory
-include_directories(external/pthash/include)
-include_directories(external/pthash/external/bits/include)
-include_directories(external/pthash/external/fastmod)
-include_directories(external/pthash/external/bits/external/essentials/include)
-include_directories(external/pthash/external/xxHash)
-include_directories(external/pthash/external/mm_file/include)
-
 set(Z_LIB_SOURCES
   external/gz/zip_stream.cpp
 )
@@ -74,13 +66,27 @@ set(SSHASH_SOURCES
   src/statistics.cpp
 )
 
+set(SSHASH_INCLUDE_DIRS
+  external/pthash/include
+  external/pthash/external/bits/include
+  external/pthash/external/fastmod
+  external/pthash/external/bits/external/essentials/include
+  external/pthash/external/xxHash
+  external/pthash/external/mm_file/include
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/include
+)
+
 # Create a static lib
 add_library(sshash_static STATIC
   ${Z_LIB_SOURCES}
   ${SSHASH_SOURCES}
 )
 
+target_include_directories(sshash_static PUBLIC ${SSHASH_INCLUDE_DIRS})
+
 add_executable(sshash tools/sshash.cpp)
+target_include_directories(sshash PUBLIC ${SSHASH_INCLUDE_DIRS})
 target_link_libraries(sshash
   z
 )
diff --git a/include/builder/build_sparse_index.hpp b/include/builder/build_sparse_index.hpp
index 5db9084..fe9ccdb 100644
--- a/include/builder/build_sparse_index.hpp
+++ b/include/builder/build_sparse_index.hpp
@@ -82,9 +82,9 @@ buckets_statistics build_sparse_index(parse_data<kmer_t>& data, buckets<kmer_t>&
         }
     }
     offsets.push_back(num_super_kmers);
-    assert(offsets.size() == num_threads + 1);
 
-    std::vector<buckets_statistics> threads_buckets_stats(num_threads);
+    std::vector<buckets_statistics> threads_buckets_stats;
+    threads_buckets_stats.reserve(num_threads);
 
     auto exe = [&](const uint64_t thread_id) {
         assert(thread_id + 1 < offsets.size());
@@ -115,11 +115,12 @@ buckets_statistics build_sparse_index(parse_data<kmer_t>& data, buckets<kmer_t>&
         }
     };
 
-    std::vector<std::thread> threads(num_threads);
-    for (uint64_t thread_id = 0; thread_id != num_threads; ++thread_id) {
-        threads_buckets_stats[thread_id] =
-            buckets_statistics(num_buckets, num_kmers, num_minimizer_positions);
-        threads[thread_id] = std::thread(exe, thread_id);
+    std::vector<std::thread> threads;
+    threads.reserve(num_threads);
+    assert(offsets.size() <= num_threads + 1);
+    for (uint64_t thread_id = 0; thread_id + 1 < size(offsets); ++thread_id) {
+        threads_buckets_stats.emplace_back(num_buckets, num_kmers, num_minimizer_positions);
+        threads.emplace_back(exe, thread_id);
     }
     for (auto& t : threads) {
         if (t.joinable()) t.join();
diff --git a/include/builder/parallel_sort.hpp b/include/builder/parallel_sort.hpp
index 66668e0..9b6c63c 100644
--- a/include/builder/parallel_sort.hpp
+++ b/include/builder/parallel_sort.hpp
@@ -55,18 +55,18 @@ void parallel_merge(Iterator A_begin, Iterator A_end,                   //
     working memory, which must be of same size as data.
 */
 template <typename T, typename Compare>
-void parallel_sort(std::vector<T>& data, const uint64_t num_threads, Compare comp)  //
+void parallel_sort(std::vector<T>& data, uint64_t num_threads, Compare comp)  //
 {
     std::vector<T> temp_data;
     temp_data.resize(data.size());
 
+    const uint64_t data_size = data.size();
+    num_threads = std::min(num_threads, data_size);
     if (num_threads <= 1) {
         std::sort(data.begin(), data.end(), comp);
         return;
     }
 
-    assert((num_threads & (num_threads - 1)) == 0);
-    const uint64_t data_size = data.size();
     const uint64_t chunk_size = (data_size + num_threads - 1) / num_threads;
     const uint64_t sequential_merge_threshold = data_size / uint64_t(std::log2(num_threads));
     assert(sequential_merge_threshold > 0);
@@ -75,12 +75,13 @@ void parallel_sort(std::vector<T>& data, const uint64_t num_threads, Compare com
     threads.reserve(num_threads);
 
     using iterator_t = typename std::vector<T>::iterator;
-    std::vector<std::pair<iterator_t, iterator_t>> ranges(num_threads);
+    std::vector<std::pair<iterator_t, iterator_t>> ranges;
+    ranges.reserve(num_threads);
 
-    for (uint64_t i = 0; i != num_threads; ++i) {
+    for (uint64_t i = 0; i * chunk_size < data_size; ++i) {
         uint64_t begin = i * chunk_size;
-        uint64_t end = (i == num_threads - 1) ? data_size : begin + chunk_size;
-        ranges[i] = {data.begin() + begin, data.begin() + end};
+        uint64_t end = std::min(data_size, begin + chunk_size);
+        ranges.emplace_back(data.begin() + begin, data.begin() + end);
         threads.emplace_back(
             [&, begin, end]() { std::sort(data.begin() + begin, data.begin() + end, comp); });
     }
@@ -94,27 +95,29 @@ void parallel_sort(std::vector<T>& data, const uint64_t num_threads, Compare com
     while (ranges.size() != 1)  //
     {
         next_ranges.clear();
-        for (uint64_t i = 0; i != ranges.size(); i += 2) {
+        for (uint64_t i = 0; i < ranges.size(); i += 2) {
+            auto [begin1, end1] = ranges[i];
+
+            auto input = data.begin();
+            auto output = temp_data.begin();
+            if (swap) std::swap(input, output);
+            uint64_t offset = std::distance(input, begin1);
+            auto output_iterator = output + offset;
+            assert(offset <= data_size);
+            uint64_t output_size = end1 - begin1;
             if (i + 1 < ranges.size()) {
-                auto [begin1, end1] = ranges[i];
                 auto [begin2, end2] = ranges[i + 1];
-                uint64_t output_size = (end1 - begin1) + (end2 - begin2);
-
-                auto input = data.begin();
-                auto output = temp_data.begin();
-                if (swap) std::swap(input, output);
-                uint64_t offset = std::distance(input, begin1);
-                auto output_iterator = output + offset;
-                assert(offset <= data_size);
+                output_size += end2 - begin2;
 
                 parallel_merge(begin1, end1, begin2, end2, output_iterator, comp,
                                sequential_merge_threshold);
                 assert(std::is_sorted(output_iterator, output_iterator + output_size, comp));
-
-                auto merged_begin = output_iterator;
-                auto merged_end = merged_begin + output_size;
-                next_ranges.push_back({merged_begin, merged_end});
+            } else {
+                std::move(begin1, end1, output_iterator);
             }
+            auto merged_begin = output_iterator;
+            auto merged_end = merged_begin + output_size;
+            next_ranges.push_back({merged_begin, merged_end});
         }
         ranges.swap(next_ranges);
         swap = !swap;
diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp
index 0a74583..f269788 100644
--- a/include/builder/parse_file.hpp
+++ b/include/builder/parse_file.hpp
@@ -149,7 +149,7 @@ void parse_file(std::istream& is, parse_data<kmer_t>& data,
 
         uint64_t i = 0;
         if constexpr (kmer_t::bits_per_char == 2) {
-#if !defined(SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING) and defined(__x86_64__)
+#if !defined(SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING) and defined(__AVX2__)
 
             /* process 32 bytes at a time */
             for (; i + 32 <= n; i += 32) {
@@ -214,7 +214,7 @@ void parse_file(std::istream& is, parse_data<kmer_t>& data,
     std::vector<std::thread> threads;
     threads.reserve(num_threads);
 
-    for (uint64_t t = 0; t != num_threads; ++t)  //
+    for (uint64_t t = 0; t * num_sequences_per_thread < num_sequences; ++t)  //
     {
         threads.emplace_back([&, t] {
             std::vector<minimizer_tuple> buffer;
@@ -249,7 +249,7 @@ void parse_file(std::istream& is, parse_data<kmer_t>& data,
             minimizer_iterator<kmer_t> minimizer_it(k, m, hasher);
             minimizer_iterator_rc<kmer_t> minimizer_it_rc(k, m, hasher);
 
-            for (uint64_t i = index_begin; i != index_end; ++i)  //
+            for (uint64_t i = index_begin; i < index_end; ++i)  //
             {
                 const uint64_t begin = data.pieces[i];
                 const uint64_t end = data.pieces[i + 1];
diff --git a/include/builder/util.hpp b/include/builder/util.hpp
index d40d40f..0b9b2cc 100644
--- a/include/builder/util.hpp
+++ b/include/builder/util.hpp
@@ -3,7 +3,7 @@
 #include <vector>
 #include <atomic>
 
-#if defined(__x86_64__)
+#if defined(__AVX2__)
 #include <immintrin.h>
 #include <x86intrin.h>
 #endif
@@ -30,7 +30,7 @@ struct parse_runtime_error : public std::runtime_error {
     }
 }
 
-#if defined(__x86_64__)
+#if defined(__AVX2__)
 /*
     This function takes 32 bytes and packs the two bits
     in positions 1 and 2 (from right) of each byte into
diff --git a/include/dictionary.hpp b/include/dictionary.hpp
index b9d0c33..178a2d1 100644
--- a/include/dictionary.hpp
+++ b/include/dictionary.hpp
@@ -134,6 +134,9 @@ struct dictionary {
         visit_impl(visitor, *this);
     }
 
+    const buckets<kmer_t>& data() const { return m_buckets; }
+    const minimizers& get_minimizers() const { return m_minimizers; }
+
 private:
     template <typename Visitor, typename T>
     static void visit_impl(Visitor& visitor, T&& t) {
diff --git a/include/kmer.hpp b/include/kmer.hpp
index 25edd42..ea6b061 100644
--- a/include/kmer.hpp
+++ b/include/kmer.hpp
@@ -15,7 +15,7 @@ namespace sshash {
 template <typename Kmer, uint8_t BitsPerChar>
 struct uint_kmer_t {
     using uint_t = Kmer;
-    Kmer kmer;
+    Kmer kmer = 0;
 
     uint_kmer_t() {}
     uint_kmer_t(uint64_t kmer) : kmer(kmer) {}
@@ -284,6 +284,19 @@ struct aa_uint_kmer_t : alpha_kmer_t<Kmer, 5, amino_acids> {
 
     static bool is_valid(char c) { return ~char_to_aa[static_cast<size_t>(c)]; }
     static uint64_t char_to_uint(char c) { return char_to_aa[static_cast<size_t>(c)]; }
+
+    // For proteins, there's no reverse complement, so map each character to itself
+    // This allows streaming_query to work with protein alphabets
+    static constexpr char canonicalize_basepair_reverse_map[256] = {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 0, 0, 0, 0, 0,
+        0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+    };
 };
 
 // also supports bitpack<__uint128_t, 1>, std::bitset<256>, etc
diff --git a/include/util.hpp b/include/util.hpp
index 3cb1247..e83fe8d 100644
--- a/include/util.hpp
+++ b/include/util.hpp
@@ -170,7 +170,7 @@ struct build_configuration {
 
 namespace util {
 
-static void check_version_number(essentials::version_number const& vnum) {
+static inline void check_version_number(essentials::version_number const& vnum) {
     if (vnum.x != constants::current_version_number::x) {
         throw std::runtime_error("MAJOR index version mismatch: SSHash index needs rebuilding");
     }
diff --git a/src/build.cpp b/src/build.cpp
index 759326b..117f2d7 100644
--- a/src/build.cpp
+++ b/src/build.cpp
@@ -28,9 +28,6 @@ void dictionary<kmer_t>::build(std::string const& filename,
     if (build_config.l >= constants::max_l) {
         throw std::runtime_error("l must be < " + std::to_string(constants::max_l));
     }
-    if ((build_config.num_threads & (build_config.num_threads - 1)) != 0) {
-        throw std::runtime_error("number of threads must be a power of 2");
-    }
 
     m_k = build_config.k;
     m_m = build_config.m;
@@ -126,11 +123,11 @@ void dictionary<kmer_t>::build(std::string const& filename,
             input.read(reinterpret_cast<char*>(buffer.data()),
                        buffer.size() * sizeof(minimizer_tuple));
             const uint64_t chunk_size = (n + num_threads - 1) / num_threads;
-            for (uint64_t t = 0; t != num_threads; ++t) {
+            for (uint64_t t = 0; t * chunk_size < n; ++t) {
                 uint64_t begin = t * chunk_size;
-                uint64_t end = (t == num_threads - 1) ? n : begin + chunk_size;
+                uint64_t end = std::min(n, begin + chunk_size);
                 threads.emplace_back([begin, end, &buffer, &f]() {
-                    for (uint64_t i = begin; i != end; ++i) {
+                    for (uint64_t i = begin; i < end; ++i) {
                         buffer[i].minimizer = f.lookup(buffer[i].minimizer);
                     }
                 });
diff --git a/src/info.cpp b/src/info.cpp
index 95d73d0..27ccdf9 100644
--- a/src/info.cpp
+++ b/src/info.cpp
@@ -2,10 +2,11 @@
 
 namespace sshash {
 
-static double bits_per_kmer_formula(uint64_t k, /* kmer length */
-                                    uint64_t m, /* minimizer length */
-                                    uint64_t n, /* num. kmers */
-                                    uint64_t M) /* num. strings in SPSS */
+[[maybe_unused]]
+static inline double bits_per_kmer_formula(uint64_t k, /* kmer length */
+                                           uint64_t m, /* minimizer length */
+                                           uint64_t n, /* num. kmers */
+                                           uint64_t M) /* num. strings in SPSS */
 {
     /*
       Caveats:
@@ -35,7 +36,7 @@ static double bits_per_kmer_formula(uint64_t k, /* kmer length */
     return num_bits / n;
 }
 
-double perc(uint64_t amount, uint64_t total) { return (amount * 100.0) / total; }
+inline double perc(uint64_t amount, uint64_t total) { return (amount * 100.0) / total; }
 
 template <class kmer_t>
 void dictionary<kmer_t>::print_space_breakdown() const {
diff --git a/tools/build.cpp b/tools/build.cpp
index cb38f14..9c9d738 100644
--- a/tools/build.cpp
+++ b/tools/build.cpp
@@ -21,7 +21,7 @@ int build(int argc, char** argv) {
     parser.add("seed",
                "Seed for construction (default is " + std::to_string(constants::seed) + ").", "-s",
                false);
-    parser.add("t", "Number of threads (default is 1). Must be a power of 2.", "-t", false);
+    parser.add("t", "Number of threads (default is 1).", "-t", false);
     parser.add("l",
                "A (integer) constant that controls the space/time trade-off of the dictionary. "
                "A reasonable values lies in [2.." +