From a3d42e030904d6444fbc96bfe4044276ee1bbbff Mon Sep 17 00:00:00 2001 From: Igor Martayan Date: Wed, 21 May 2025 11:44:12 +0200 Subject: [PATCH] Add `revcomp_word` and `to_word_revcomp` to `Seq` --- Cargo.lock | 12 ++++++------ src/ascii.rs | 5 +++++ src/ascii_seq.rs | 5 +++++ src/packed_seq.rs | 14 ++++++++++++++ src/test.rs | 23 +++++++++++++++++++++++ src/traits.rs | 24 ++++++++++++++++++++++++ 6 files changed, 77 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cd635e7..4852a28 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -22,9 +22,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.9.0" +version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" +checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" [[package]] name = "bytemuck" @@ -96,7 +96,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c40d342ff20a2ce62d9a85ce406e672dfa137f902ac9670034533184f1533976" dependencies = [ "anyhow", - "bitflags 2.9.0", + "bitflags 2.9.1", "common_traits", "epserde-derive", "maligned", @@ -203,7 +203,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e3eb8a4c851185832fb250e5c041e74f4408e11fd56263cc1f61b48f9174f5" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.1", "maligned", "mem_dbg-derive", "mmap-rs", @@ -495,7 +495,7 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec7dddc5f0fee506baf8b9fdb989e242f17e4b11c61dfbb0635b705217199eea" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.1", "byteorder", "enum-as-inner", "libc", @@ -750,7 +750,7 @@ version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.1", ] [[package]] diff --git a/src/ascii.rs b/src/ascii.rs index 5671ea7..8d0e501 100644 --- a/src/ascii.rs +++ b/src/ascii.rs @@ -31,6 +31,11 @@ impl<'s> Seq<'s> for &[u8] { unsafe { (self.as_ptr() as *const usize).read_unaligned() & mask } } + #[inline(always)] + fn to_word_revcomp(&self) -> usize { + unimplemented!("Reverse complement is only defined for DNA sequences, use `AsciiSeq` or `PackedSeq` instead.") + } + /// Convert to an owned version. fn to_vec(&self) -> Vec { <[u8]>::to_vec(self) diff --git a/src/ascii_seq.rs b/src/ascii_seq.rs index 7365283..1fb9acd 100644 --- a/src/ascii_seq.rs +++ b/src/ascii_seq.rs @@ -77,6 +77,11 @@ impl<'s> Seq<'s> for AsciiSeq<'s> { val as usize } + #[inline(always)] + fn to_word_revcomp(&self) -> usize { + Self::revcomp_word(self.to_word(), self.len()) + } + /// Convert to an owned version. fn to_vec(&self) -> AsciiSeqVec { AsciiSeqVec { diff --git a/src/packed_seq.rs b/src/packed_seq.rs index 697f863..a9e34d4 100644 --- a/src/packed_seq.rs +++ b/src/packed_seq.rs @@ -119,6 +119,7 @@ impl<'s> Seq<'s> for PackedSeq<'s> { unpack_base(self.get(index)) } + /// Convert a short sequence (kmer) to a packed representation as `usize`. /// Panics if `self` is longer than 29 characters. #[inline(always)] fn to_word(&self) -> usize { @@ -129,6 +130,19 @@ impl<'s> Seq<'s> for PackedSeq<'s> { } } + /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `usize`. + /// Panics if `self` is longer than 29 characters. + #[inline(always)] + fn to_word_revcomp(&self) -> usize { + debug_assert!(self.len() <= usize::BITS as usize / 2 - 3); + unsafe { + Self::revcomp_word( + (self.seq.as_ptr() as *const usize).read_unaligned() >> (2 * self.offset), + self.len(), + ) + } + } + fn to_vec(&self) -> PackedSeqVec { assert_eq!(self.offset, 0); PackedSeqVec { diff --git a/src/test.rs b/src/test.rs index b1b845d..efdd87f 100644 --- a/src/test.rs +++ b/src/test.rs @@ -523,3 +523,26 @@ fn slice_get() { let get = (0..n).map(|i| s.as_slice().get(i)).collect::>(); assert_eq!(iter_bp, get); } + +#[test] +fn rc_rc() { + let n = 10000; + let seq = PackedSeqVec::random(n); + for k in 1..=29 { + for i in 0..=(n - k) { + let word = seq.slice(i..i + k).to_word(); + let rc = seq.slice(i..i + k).to_word_revcomp(); + assert_eq!(PackedSeq::revcomp_word(word, k), rc, "k={k} i={i}"); + assert_eq!(PackedSeq::revcomp_word(rc, k), word, "k={k} i={i}"); + } + } + let seq = AsciiSeqVec::random(n); + for k in 1..=32 { + for i in 0..=(n - k) { + let word = seq.slice(i..i + k).to_word(); + let rc = seq.slice(i..i + k).to_word_revcomp(); + assert_eq!(AsciiSeq::revcomp_word(word, k), rc, "k={k} i={i}"); + assert_eq!(AsciiSeq::revcomp_word(rc, k), word, "k={k} i={i}"); + } + } +} diff --git a/src/traits.rs b/src/traits.rs index 74c1520..89a43aa 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -38,6 +38,30 @@ pub trait Seq<'s>: Copy + Eq + Ord { /// Convert a short sequence (kmer) to a packed representation as `usize`. fn to_word(&self) -> usize; + /// Convert a short sequence (kmer) to a packed representation of its reverse complement as `usize`. + fn to_word_revcomp(&self) -> usize; + + /// Compute the reverse complement of a short sequence packed in a `usize`. + #[inline(always)] + fn revcomp_word(word: usize, len: usize) -> usize { + #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] + { + let mut res = word.reverse_bits(); // ARM can reverse bits in a single instruction + res = ((res >> 1) & 0x5555_5555_5555_5555) | ((res & 0x5555_5555_5555_5555) << 1); + res ^= 0xAAAA_AAAA_AAAA_AAAA; + res >> (usize::BITS as usize - 2 * len) + } + + #[cfg(not(any(target_arch = "arm", target_arch = "aarch64")))] + { + let mut res = word.swap_bytes(); + res = ((res >> 4) & 0x0F0F_0F0F_0F0F_0F0F) | ((res & 0x0F0F_0F0F_0F0F_0F0F) << 4); + res = ((res >> 2) & 0x3333_3333_3333_3333) | ((res & 0x3333_3333_3333_3333) << 2); + res ^= 0xAAAA_AAAA_AAAA_AAAA; + res >> (usize::BITS as usize - 2 * len) + } + } + /// Convert to an owned version. fn to_vec(&self) -> Self::SeqVec;