Merged
Changes from all commits
115 commits
80bf43a
Merge pull request #80 from jermp/master
jermp Sep 3, 2025
b042476
some comments on random lookup benchmark
jermp Sep 3, 2025
5a47677
another comment on random lookup benchmark
jermp Sep 3, 2025
41071fe
some results on random lookup benchmark
jermp Sep 4, 2025
e894045
updated pthash; simplified hash utils
jermp Sep 8, 2025
21c2e85
updated hash utils
jermp Sep 8, 2025
d0e39db
tripartition of offsets
jermp Sep 12, 2025
550dbd9
fix
jermp Sep 12, 2025
80c9d00
using 32-bit words for buckets.start_lists_of_size
jermp Sep 12, 2025
a0140aa
lookup for canonical indexes
jermp Sep 12, 2025
34af717
a note on presence of minimizers when lookup is resolved via the skew…
jermp Sep 13, 2025
42c3d1d
fixed constants for skew index; merge parse and skew index construction
jermp Sep 14, 2025
c49df82
new results taken on 14/09/25: slightly faster construction, faster q…
jermp Sep 14, 2025
2c2ccc0
new results.png
jermp Sep 14, 2025
7c2e9c2
new results.png
jermp Sep 14, 2025
2155fff
a note on SIMD for encoding in dictionary::lookup; optimized string_t…
jermp Sep 16, 2025
c907f6d
a note about loop-unrolling in string_to_uint_kmer
jermp Sep 16, 2025
041d6d5
removed useless line
jermp Sep 16, 2025
c716fe7
minor fix to num. partitions in skew index; better access
jermp Sep 19, 2025
1ec6110
use a bits::compact_vector for (iteration to be fixed)
jermp Sep 19, 2025
bbcc2b6
updated external/bits
jermp Sep 20, 2025
27d8b72
updated external/bits and using bits::endpoints_sequence
jermp Sep 22, 2025
6b48d47
added missing include for compilation on Linux
jermp Sep 22, 2025
dd1a7d2
added missing include for compilation on Linux
jermp Sep 22, 2025
053f012
results 22-09-25 for k=31
jermp Sep 23, 2025
a972335
a note in readme
jermp Sep 23, 2025
cfc22a2
perf lookup by list size
jermp Sep 23, 2025
3c698d7
updated results to 22/09/25
jermp Sep 25, 2025
43dd436
added endpoints.hpp
jermp Sep 26, 2025
4390b13
minor
jermp Oct 1, 2025
df7438b
using encoded offsets
jermp Oct 3, 2025
3ccbbf4
clean up
jermp Oct 4, 2025
2efac6c
clean up and implemented endpoints::id_to_offset
jermp Oct 4, 2025
6e4d9aa
fixed CMakeLists.txt
jermp Oct 4, 2025
f66ed13
fixed endpoints and parallel correctness check
jermp Oct 5, 2025
571e3d4
added bioconda badge
jermp Oct 5, 2025
b8f589c
implemented all miscellaneous fixes by Oleksandr Kulkov
jermp Oct 6, 2025
eb4d1c4
updated external/pthash
jermp Oct 6, 2025
ac4abe6
set offsets using a single thread
jermp Oct 7, 2025
26f48a5
removed unused code
jermp Oct 7, 2025
e12fc8d
minor
jermp Oct 7, 2025
d22d01d
back to previous scheme
jermp Oct 10, 2025
4c07bde
more
jermp Oct 11, 2025
91677e7
more (needs fixing)
jermp Oct 12, 2025
14c832f
fix
jermp Oct 12, 2025
4733dae
fix perf test iterator
jermp Oct 12, 2025
a9055e2
big refactoring
jermp Oct 15, 2025
0f31776
minor
jermp Oct 15, 2025
13360a4
optimized num. locate queries
jermp Oct 16, 2025
858e71b
optimized num. locate queries
jermp Oct 16, 2025
127ca04
minor
jermp Oct 16, 2025
091f244
minor
jermp Oct 16, 2025
64c8443
XXH128 does not work on AMD processor: rewritten hashers for minimize…
jermp Oct 18, 2025
f5215ef
added cityhash
jermp Oct 19, 2025
09244aa
parallel checks
jermp Oct 21, 2025
5bb6ef3
print cmd; build and bench scripts updated
jermp Oct 21, 2025
d5987b2
build and bench scripts updated
jermp Oct 21, 2025
46d2118
new benchmarks logs: 21/10/25
jermp Oct 22, 2025
1b373f3
cap kmers to scan in perf_test_iterator to 10^8
jermp Oct 22, 2025
813b9bc
updated scripts
jermp Oct 22, 2025
2e42570
minor
jermp Oct 22, 2025
f66ce60
fixed build script and new results (22/10/25); also, noted that encod…
jermp Oct 22, 2025
a028972
added results
jermp Oct 23, 2025
db02c17
compute min by scan is actually faster than using a min-heap
jermp Oct 24, 2025
e67257e
scripts updated
jermp Oct 25, 2025
e9a525d
simplified file_merging_iterator
jermp Oct 25, 2025
ff33ec7
optimized merging with a looser tree (faster then a min-heap because …
jermp Oct 25, 2025
7f0b05d
avoid branch in tight loop
jermp Oct 27, 2025
ac04609
wrong namespace
jermp Oct 27, 2025
0ee2aa7
minor
jermp Oct 31, 2025
33020f4
quiet build
jermp Oct 31, 2025
70ceef1
quiet build
jermp Oct 31, 2025
2ae21fc
refctoring of build steps
jermp Nov 2, 2025
007ca31
json stats and refactored dictionary_builder
jermp Nov 3, 2025
c31d22f
minor
jermp Nov 3, 2025
e275d51
prefetching experiment: a little gain
jermp Nov 3, 2025
c41bdb8
json stats for perf benchmark
jermp Nov 3, 2025
2efb5d4
prefetching helps indeed random lookup
jermp Nov 3, 2025
7530305
prefetching also for canonical lookup
jermp Nov 4, 2025
0c53a23
updated external/pthash and refactored offsets.hpp
jermp Nov 4, 2025
e644a94
step 7.1 and 7.2 timed as well
jermp Nov 4, 2025
6fb7925
minor
jermp Nov 4, 2025
7d29302
examples in the readme updated
jermp Nov 5, 2025
fe05a41
minor
jermp Nov 6, 2025
f264b9b
minor
jermp Nov 6, 2025
5a12f40
build.py
jermp Nov 6, 2025
9219105
bench.py
jermp Nov 6, 2025
3e01643
build.py
jermp Nov 6, 2025
d5fb57c
deleted old scripts
jermp Nov 6, 2025
2b751b2
fix build.py script
jermp Nov 7, 2025
bd8be44
fix build.py script
jermp Nov 7, 2025
8c8562a
fix script
jermp Nov 7, 2025
eefc24e
updated essentials; fixed script
jermp Nov 10, 2025
efc3212
fix streaming query multiline fasta
jermp Nov 10, 2025
b7f815f
more stats to json
jermp Nov 10, 2025
2e6c05b
bench results 10/11/25
jermp Nov 10, 2025
99900cb
updated results; better streaming query script
jermp Nov 11, 2025
c4218d1
different query file for SE
jermp Nov 11, 2025
0502751
results updated
jermp Nov 12, 2025
71a6a93
benchmarks subfolder refactored
jermp Nov 12, 2025
98ee7a8
print version number in main tool
jermp Nov 12, 2025
3ed53ad
a note on benchmarks
jermp Nov 12, 2025
4365223
minor
jermp Nov 15, 2025
c769e05
sbwt results for k=63
jermp Nov 16, 2025
8c6ac62
prefetching does not actually help but writing offsets to an array fi…
jermp Nov 16, 2025
6f225a2
added results for sshash-v3 to compare against
jermp Nov 28, 2025
b59a3e4
removed empty json files
jermp Dec 8, 2025
32fd510
minor name cleanup
jermp Dec 18, 2025
de450db
minor name cleanup
jermp Dec 18, 2025
ad0cac9
removed some old comments
jermp Dec 18, 2025
1fbb593
README UPDATED
jermp Dec 18, 2025
4d9786f
Merge branch 'master' into bench
jermp Dec 19, 2025
ae61108
resolved some conflicts for merging into master
jermp Dec 19, 2025
63f5927
resolved some conflicts for merging into master
jermp Dec 19, 2025
aa95d7f
removed some old comments
jermp Dec 19, 2025
2 changes: 0 additions & 2 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -148,5 +148,3 @@ StatementMacros:
TabWidth: 8
UseTab: Never
...


3 changes: 1 addition & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ if (UNIX)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces -Wno-unknown-attributes -Wno-unused-function")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror -Wno-missing-braces -Wno-unknown-attributes -Wno-unused-function")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")

if (SSHASH_USE_SANITIZERS)
Expand Down Expand Up @@ -63,7 +63,6 @@ set(SSHASH_SOURCES
src/dictionary.cpp
src/query.cpp
src/info.cpp
src/statistics.cpp
)

set(SSHASH_INCLUDE_DIRS
Expand Down
74 changes: 28 additions & 46 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7772316.svg)](https://doi.org/10.5281/zenodo.7772316)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7239205.svg)](https://doi.org/10.5281/zenodo.7239205)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.17582116.svg)](https://doi.org/10.5281/zenodo.17582116)

<picture>
<source media="(prefers-color-scheme: dark)" srcset="img/sshash_on_dark.png">
Expand All @@ -24,8 +25,8 @@ The data structure is described in the following papers:
For a dictionary of n k-mers,
two basic queries are supported:

- i = **Lookup**(g), where i is in [0,n) if the k-mer g is found in the dictionary or i = -1 otherwise;
- g = **Access**(i), where g is the k-mer associated to the identifier i.
- i = **Lookup**(x), where i is in [0,n) if the k-mer x is found in the dictionary, or i = -1 otherwise;
- x = **Access**(i), where x is the k-mer associated with the identifier i.
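
The contract between these two queries can be sketched with a toy model. This is plain Python used purely for illustration — SSHash itself is a C++ library and exposes no such Python API; the names `lookup`/`access` are hypothetical:

```python
kmers = ["ACGTA", "CGTAC", "GTACG"]  # n = 3 distinct k-mers, k = 5
index = {g: i for i, g in enumerate(kmers)}  # stands in for the real data structure

def lookup(x):
    """Return an identifier in [0, n) if x is in the dictionary, else -1."""
    return index.get(x, -1)

def access(i):
    """Return the k-mer associated with identifier i (the inverse of lookup)."""
    return kmers[i]

# lookup and access are inverses on the indexed k-mers
assert all(access(lookup(g)) == g for g in kmers)
assert lookup("AAAAA") == -1  # absent k-mer
```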

If the weights of the k-mers (their frequency counts) are also stored in the dictionary, then the dictionary is said to be *weighted* and it additionally supports:

Expand All @@ -36,9 +37,9 @@ Other supported queries are:
- **Membership Queries**: determine if a given k-mer is present in the dictionary or not.
- **Streaming Queries**: stream through all k-mers of a given DNA file
(.fasta or .fastq formats) to determine their membership in the dictionary.
- **Navigational Queries**: given a k-mer g[1..k] determine if g[2..k]+x is present (forward neighbourhood) and if x+g[1..k-1] is present (backward neighbourhood), for x = A, C, G, T ('+' here means string concatenation).
SSHash internally stores a set of strings, called *contigs* in the following, each associated to a distinct identifier.
If a contig identifier is specified for a navigational query (rather than a k-mer), then the backward neighbourhood of the first k-mer and the forward neighbourhood of the last k-mer in the contig are returned.
- **Navigational Queries**: given a k-mer x[1..k] determine if x[2..k]+c is present (forward neighbourhood) and if c+x[1..k-1] is present (backward neighbourhood), for c in {A,C,G,T} ('+' here means string concatenation).
SSHash internally stores a set of strings, each associated with a distinct identifier.
If a string identifier is specified for a navigational query (rather than a k-mer), then the backward neighbourhood of the first k-mer and the forward neighbourhood of the last k-mer in the string are returned.
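
The neighbourhood computation described above can be sketched as follows. This is an illustrative Python model of the query semantics, not the SSHash API; the `present` predicate stands in for the dictionary:

```python
def neighbours(x, present):
    """Forward and backward neighbourhoods of k-mer x, given a
    membership predicate `present` over k-mers."""
    fwd = [x[1:] + c for c in "ACGT" if present(x[1:] + c)]  # x[2..k]+c
    bwd = [c + x[:-1] for c in "ACGT" if present(c + x[:-1])]  # c+x[1..k-1]
    return fwd, bwd

kmers = {"ACG", "CGT", "TAC"}  # a toy k-mer set, k = 3
fwd, bwd = neighbours("ACG", kmers.__contains__)
# "CG"+T = "CGT" is present forward; T+"AC" = "TAC" is present backward
assert fwd == ["CGT"] and bwd == ["TAC"]
```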

If you are interested in a **membership-only** version of SSHash, have a look at [SSHash-Lite](https://github.com/jermp/sshash-lite). It also works for input files with duplicate k-mers (e.g., [matchtigs](https://github.com/algbio/matchtigs) [4]). For a query sequence S and a given coverage threshold E in [0,1], the sequence is considered to be present in the dictionary if at least E*(|S|-k+1) of the k-mers of S are positive.
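
The coverage rule used by SSHash-Lite amounts to a one-line check; the helper below is hypothetical and only makes the threshold arithmetic concrete:

```python
def sequence_present(hits, seq_len, k, E):
    """S is considered present iff at least E*(|S|-k+1) of its
    (|S|-k+1) k-mers are found in the dictionary."""
    return hits >= E * (seq_len - k + 1)

# |S| = 100 and k = 31 give 70 k-mers; with E = 0.8, at least 56 must be found
assert sequence_present(56, 100, 31, 0.8)
assert not sequence_present(55, 100, 31, 0.8)
```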

Expand Down Expand Up @@ -76,6 +77,8 @@ To compile the code for a release environment (see file `CMakeLists.txt` for the
cmake ..
make -j

**NOTE**: For best performance on `x86` architectures, the option `-D SSHASH_USE_ARCH_NATIVE` can be specified as well.

For a testing environment, use the following instead:

mkdir debug_build
Expand Down Expand Up @@ -142,18 +145,6 @@ Tools and Usage
There is one executable called `sshash` after the compilation, which can be used to run a tool.
Run `./sshash` as follows to see a list of available tools.

== SSHash: (S)parse and (S)kew (Hash)ing of k-mers =========================

Usage: ./sshash <tool> ...

Available tools:
build build a dictionary
query query a dictionary
check check correctness of a dictionary
bench run performance tests for a dictionary
permute permute a weighted input file
compute-statistics compute index statistics

For large-scale indexing, it may be necessary to increase the number of file descriptors that can be opened simultaneously:

ulimit -n 2048
Expand All @@ -179,50 +170,50 @@ such collections of stitched unitigs can be obtained from raw FASTA files.

### Example 1

./sshash build -i ../data/unitigs_stitched/salmonella_enterica_k31_ust.fa.gz -k 31 -m 13 --check --bench -o salmonella_enterica.index
./sshash build -i ../data/unitigs_stitched/salmonella_enterica_k31_ust.fa.gz -k 31 -m 13 --check --bench -o salmonella_enterica.sshash

This example builds a dictionary for the k-mers read from the file `../data/unitigs_stitched/salmonella_enterica_k31_ust.fa.gz`,
with k = 31 and m = 13. It also checks the correctness of the dictionary (`--check` option), runs a performance benchmark (`--bench` option), and serializes the index on disk to the file `salmonella_enterica.index`.
with k = 31 and m = 13. It also checks the correctness of the dictionary (`--check` option), runs a performance benchmark (`--bench` option), and serializes the index on disk to the file `salmonella_enterica.sshash`.

To run a performance benchmark after construction of the index,
use:

./sshash bench -i salmonella_enterica.index
./sshash bench -i salmonella_enterica.sshash

To also store the weights, use the option `--weighted`:

./sshash build -i ../data/unitigs_stitched/with_weights/salmonella_enterica.ust.k31.fa.gz -k 31 -m 13 --weighted --check --verbose

### Example 2

./sshash build -i ../data/unitigs_stitched/salmonella_100_k31_ust.fa.gz -k 31 -m 15 -l 2 -o salmonella_100.index
./sshash build -i ../data/unitigs_stitched/salmonella_100_k31_ust.fa.gz -k 31 -m 15 -o salmonella_100.sshash

This example builds a dictionary from the input file `../data/unitigs_stitched/salmonella_100_k31_ust.fa.gz` (a pangenome consisting of 100 genomes of *Salmonella Enterica*), with k = 31, m = 15, and l = 2. It also serializes the index on disk to the file `salmonella_100.index`.
This example builds a dictionary from the input file `../data/unitigs_stitched/salmonella_100_k31_ust.fa.gz` (a pangenome consisting of 100 genomes of *Salmonella Enterica*), with k = 31 and m = 15. It also serializes the index on disk to the file `salmonella_100.sshash`.

To perform some streaming membership queries, use:

./sshash query -i salmonella_100.index -q ../data/queries/SRR5833294.10K.fastq.gz
./sshash query -i salmonella_100.sshash -q ../data/queries/SRR5833294.10K.fastq.gz

if your queries are meant to be read from a FASTQ file, or

./sshash query -i salmonella_100.index -q ../data/queries/salmonella_enterica.fasta.gz --multiline
./sshash query -i salmonella_100.sshash -q ../data/queries/salmonella_enterica.fasta.gz --multiline

if your queries are to be read from a (multi-line) FASTA file.

### Example 3

./sshash build -i ../data/unitigs_stitched/salmonella_100_k31_ust.fa.gz -k 31 -m 13 -l 4 -s 347692 --canonical -o salmonella_100.canon.index
./sshash build -i ../data/unitigs_stitched/salmonella_100_k31_ust.fa.gz -k 31 -m 13 --canonical -o salmonella_100.canon.sshash

This example builds a dictionary from the input file `../data/unitigs_stitched/salmonella_100_k31_ust.fa.gz` (the same as in Example 2), with k = 31, m = 13, l = 4, using the seed 347692 for construction (`-s 347692`), and with the canonical parsing modality (option `--canonical`). The dictionary is serialized on disk to the file `salmonella_100.canon.index`.
This example builds a dictionary from the input file `../data/unitigs_stitched/salmonella_100_k31_ust.fa.gz` (the same as in Example 2), with k = 31, m = 13, and the canonical parsing modality (option `--canonical`). The dictionary is serialized on disk to the file `salmonella_100.canon.sshash`.

The "canonical" version of the dictionary offers more speed for only a little space increase (for a suitable choice of parameters m and l), especially under low-hit workloads -- when the majority of k-mers are not found in the dictionary. (For all details, refer to the paper.)
The "canonical" version of the dictionary offers more speed for only a little space increase, especially under low-hit workloads -- when the majority of k-mers are not found in the dictionary. (For all details, refer to the paper.)
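
The usual convention behind canonical k-mers is to identify a k-mer with its reverse complement by always taking the lexicographically smaller of the two. The sketch below illustrates this convention only; it is not code from SSHash:

```python
COMP = str.maketrans("ACGT", "TGCA")

def canonical(x):
    """Canonical form of k-mer x: the lexicographic minimum of x and
    its reverse complement."""
    rc = x.translate(COMP)[::-1]  # reverse complement
    return min(x, rc)

# a k-mer and its reverse complement share the same canonical form
assert canonical("GATTACA") == canonical("TGTAATC")
assert canonical("TTTT") == "AAAA"
```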

Below is a comparison between the dictionary built in Example 2 (non-canonical)
and the one just built (Example 3, canonical).

./sshash query -i salmonella_100.index -q ../data/queries/SRR5833294.10K.fastq.gz
./sshash query -i salmonella_100.sshash -q ../data/queries/SRR5833294.10K.fastq.gz

./sshash query -i salmonella_100.canon.index -q ../data/queries/SRR5833294.10K.fastq.gz
./sshash query -i salmonella_100.canon.sshash -q ../data/queries/SRR5833294.10K.fastq.gz

Both queries should produce the following report (shown here for reference):

Expand Down Expand Up @@ -262,33 +253,24 @@ Input Files

SSHash is meant to index k-mers from collections that contain **neither duplicates
nor invalid k-mers** (strings containing symbols other than {A,C,G,T}).
These collections can be obtained, for example, by extracting the maximal unitigs of a de Bruijn graph.

To do so, we can use the tool [BCALM2](https://github.com/GATB/bcalm).
This tool builds a compacted de Bruijn graph and outputs its maximal unitigs.
From the output of BCALM2, we can then *stitch* (i.e., glue) some unitigs to reduce the number of nucleotides. The stitching process is carried out using the [UST](https://github.com/jermp/UST) tool.
These collections can be obtained, for example, by extracting the maximal unitigs of a de Bruijn graph, or eulertigs, using the [GGCAT](https://github.com/algbio/ggcat) algorithm.

**NOTE**: Input files are expected to have **one DNA sequence per line**. If a sequence spans multiple lines (e.g., multi-fasta), the lines should be concatenated before indexing.
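
A multi-line FASTA file can be flattened to one sequence per line with a few lines of code; this is a minimal sketch of the pre-processing step described in the note above, not a tool shipped with SSHash:

```python
def one_sequence_per_line(fasta_lines):
    """Concatenate the lines of each multi-line FASTA record into a
    single sequence string; header lines (starting with '>') are dropped."""
    seqs, cur = [], []
    for line in fasta_lines:
        line = line.strip()
        if line.startswith(">"):
            if cur:  # close the previous record
                seqs.append("".join(cur))
                cur = []
        elif line:
            cur.append(line)
    if cur:
        seqs.append("".join(cur))
    return seqs

assert one_sequence_per_line([">s1", "ACGT", "ACGT", ">s2", "TTTT"]) == ["ACGTACGT", "TTTT"]
```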

Below we provide a complete example (assuming both BCALM2 and UST are installed correctly) that downloads the Human (GRCh38) Chromosome 13 and extracts the maximal stitched unitigs for k = 31.

mkdir DNA_datasets
wget http://ftp.ensembl.org/pub/current_fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.13.fa.gz -O DNA_datasets/Homo_sapiens.GRCh38.dna.chromosome.13.fa.gz
~/bcalm/build/bcalm -in ~/DNA_datasets/Homo_sapiens.GRCh38.dna.chromosome.13.fa.gz -kmer-size 31 -abundance-min 1 -nb-cores 8
~/UST/ust -k 31 -i ~/Homo_sapiens.GRCh38.dna.chromosome.13.fa.unitigs.fa
gzip Homo_sapiens.GRCh38.dna.chromosome.13.fa.unitigs.fa.ust.fa
rm ~/Homo_sapiens.GRCh38.dna.chromosome.13.fa.unitigs.fa

#### Datasets

The script `scripts/download_and_preprocess_datasets.sh`
The script `scripts/download_and_preprocess_datasets.sh` of [this release](https://github.com/jermp/sshash/releases/tag/v3.0.0)
contains all the needed steps to download and pre-process
the datasets that we used in [1].

For the experiments in [2] and [3], we used the datasets available on [Zenodo](https://doi.org/10.5281/zenodo.7772316).
For the experiments in [2] and [3], we used the datasets available at [https://doi.org/10.5281/zenodo.7772316](https://doi.org/10.5281/zenodo.7772316).

For the latest benchmarks maintained in [this other repository](https://github.com/jermp/kmer_sets_benchmark)
we used the datasets described at [https://zenodo.org/records/17582116](https://zenodo.org/records/17582116).

#### Weights
Using the option `-all-abundance-counts` of BCALM2, it is possible to also include the abundance counts of the k-mers in the BCALM2 output. Then, use the option `-a 1` of UST to include such counts in the stitched unitigs.

Using the option `-all-abundance-counts` of [BCALM2](https://github.com/GATB/bcalm), it is possible to also include the abundance counts of the k-mers in the BCALM2 output. Then, use the option `-a 1` of [UST](https://github.com/jermp/UST) to include such counts in the stitched unitigs.

Create a New Release
--------------------
Expand Down
39 changes: 17 additions & 22 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -1,34 +1,29 @@
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7239205.svg)](https://doi.org/10.5281/zenodo.7239205)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.17582116.svg)](https://doi.org/10.5281/zenodo.17582116)

Benchmarks
----------

For these benchmarks we used the whole genomes of the following organisms:
For these benchmarks we used the datasets available here
[https://zenodo.org/records/17582116](https://zenodo.org/records/17582116).

- Gadus Morhua ("Cod")
- Falco Tinnunculus ("Kestrel")
- Homo Sapiens ("Human")

for k = 31 and 63.
To run the benchmarks, from within the `build` directory, run

The datasets and queries used in these benchmarks can be downloaded
by running the script
python3 ../script/build.py <log_label> <input_datasets_dir> <output_index_dir>
python3 ../script/bench.py <log_label> <input_index_dir>
python3 ../script/streaming-query-high-hit.py <log_label> <input_index_dir> <input_queries_dir>

```
bash download-datasets.sh
```
where `<log_label>` should be replaced by a suitable basename, e.g., the current date.

To run the benchmarks, from within the `build` directory, run
These are the results obtained on 10/11/25 (see logs [here](results-10-11-25))
on a machine equipped with an AMD Ryzen Threadripper PRO 7985WX processor clocked at 5.40GHz.
The code was compiled with `gcc` 13.3.0.

```
bash ../script/build.sh [prefix]
bash ../script/bench.sh [prefix]
bash ../script/streaming-query-high-hit.sh [prefix]
bash ../script/streaming-query-low-hit.sh [prefix]
```
The indexes were built with a max RAM usage of 16 GB and 64 threads.
Queries, instead, were run using a single thread.

where `[prefix]` should be replaced by a suitable basename, e.g., the current date.
![](results-10-11-25/results.png)

These are the results obtained on 22/08/25 (see logs [here](results-22-08-25)).
The results can be exported to CSV format with

![](results-22-08-25/results.png)
python3 ../script/print_csv.py ../benchmarks/results-10-11-25/k31
python3 ../script/print_csv.py ../benchmarks/results-10-11-25/k63
16 changes: 0 additions & 16 deletions benchmarks/download-datasets.sh

This file was deleted.
