Skip to content

Commit 6eecf07

Browse files
committed
refactor: Update peptide modification handling to support mass shifts and UniMod annotations
1 parent f41eeb5 commit 6eecf07

File tree

1 file changed

+174
-32
lines changed

1 file changed

+174
-32
lines changed

crates/redeem-properties/src/utils/peptdeep_utils.rs

Lines changed: 174 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -184,48 +184,76 @@ pub fn load_mod_to_feature(constants: &ModelConstants) -> Result<HashMap<String,
184184
}
185185

186186

187-
// #[derive(Debug, Clone)]
187+
/// A single modification entry parsed from the modification TSV.
///
/// `load_modifications` stores the same entry under a mass-based key and,
/// when an ID is available, under a `UniMod:<id>` key as well, which is why
/// the type is `Clone`. `PartialEq`/`Eq` are derived so entries can be
/// compared directly (e.g. in assertions).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ModificationMap {
    /// Modification name as read from the TSV (e.g. `"Carbamidomethyl@C"`).
    pub name: String,
    /// Residue the modification applies to; `None` if not applicable.
    pub amino_acid: Option<char>,
    /// UniMod accession number, when one is known for this entry.
    pub unimod_id: Option<u32>,
}
192193

193194

195+
/// Loads a unified modification map where the key is either:
196+
/// - ("57.0215", Some('C')) for mass-based lookup
197+
/// - ("UniMod:4", Some('C')) for UniMod ID–based lookup
194198
pub fn load_modifications() -> Result<HashMap<(String, Option<char>), ModificationMap>> {
195199
let path: PathBuf = ensure_mod_tsv_exists().context("Failed to ensure TSV exists")?;
196200

197201
let mut rdr = ReaderBuilder::new()
198202
.delimiter(b'\t')
199-
.from_path(path).context("Failed to read TSV file")?;
203+
.from_path(&path)
204+
.context("Failed to read modification TSV file")?;
200205

201206
let mut modifications = HashMap::new();
202-
207+
203208
for result in rdr.records() {
204209
let record = result.context("Failed to read record")?;
205210
let mod_name = record.get(0).unwrap_or("").to_string();
206211
let unimod_mass: f64 = record.get(1).unwrap_or("0").parse().unwrap_or(0.0);
207-
208-
// Convert mass to string with 4 decimal places
212+
let unimod_id: Option<u32> = record.get(7).and_then(|s| s.parse().ok());
213+
209214
let mass_key = format!("{:.4}", unimod_mass);
210-
211-
// Extract amino acid from mod_name
215+
let unimod_key = unimod_id.map(|id| format!("UniMod:{}", id));
216+
212217
let amino_acid = mod_name.split('@').nth(1).and_then(|aa| aa.chars().next());
213218

214-
// Create Modification struct
215219
let modification = ModificationMap {
216220
name: mod_name,
217221
amino_acid,
222+
unimod_id,
218223
};
219224

220-
// Insert into HashMap
221-
modifications.insert((mass_key, amino_acid), modification);
225+
// Insert mass-based key
226+
modifications.insert((mass_key.clone(), amino_acid), modification.clone());
227+
228+
// Insert unimod-id based key if available
229+
if let Some(key) = unimod_key {
230+
modifications.insert((key, amino_acid), modification.clone());
231+
}
222232
}
223233

224234
Ok(modifications)
225235
}
226236

237+
238+
239+
240+
/// Removes mass shifts and UniMod annotations from a modified peptide sequence.
241+
///
242+
/// Supports both bracketed mass shifts (e.g., `[+57.0215]`) and UniMod-style
243+
/// annotations (e.g., `(UniMod:4)`).
244+
///
245+
/// # Example
246+
/// ```
247+
/// use easypqp_core::data_handling::remove_mass_shift;
248+
///
249+
/// let peptide = "MGC[+57.0215]AAR";
250+
/// assert_eq!(remove_mass_shift(peptide), "MGCAAR");
251+
/// let peptide = "MGC(UniMod:4)AAR";
252+
/// assert_eq!(remove_mass_shift(peptide), "MGCAAR");
253+
/// ```
227254
pub fn remove_mass_shift(peptide: &str) -> String {
228-
let re = Regex::new(r"\[.*?\]").unwrap();
255+
// Regex to remove either [mass shift] or (UniMod:x) patterns
256+
let re = Regex::new(r"(\[.*?\]|\(UniMod:\d+\))").unwrap();
229257
re.replace_all(peptide, "").to_string()
230258
}
231259

@@ -283,37 +311,151 @@ pub fn get_modification_indices(peptide: &str) -> String {
283311
indices.join(";")
284312
}
285313

286-
pub fn get_modification_string(
287-
peptide: &str,
288-
modification_map: &HashMap<(String, Option<char>), ModificationMap>,
289-
) -> String {
290-
let naked_peptide = remove_mass_shift(peptide);
291314

292-
let extracted_masses_and_indices = extract_masses_and_indices(&peptide.to_string());
293315

294-
let mut found_modifications = Vec::new();
316+
/// Extracts mass shift annotations (e.g., [+57.0215]) from a peptide string and returns them
317+
/// as a vector of (mass_string, position) where position is the index of the annotated amino acid.
318+
///
319+
/// # Example
320+
/// ```
321+
/// use redeem_properties::utils::peptdeep_utils::extract_mass_annotations;
322+
/// let result = extract_mass_annotations("AC[+57.0215]DE");
323+
/// assert_eq!(result, vec![("57.0215".to_string(), 2)]);
324+
/// ```
325+
pub fn extract_mass_annotations(peptide: &str) -> Vec<(String, usize)> {
326+
let re_mass = Regex::new(r"\[([+-]?\d*\.?\d+)\]").unwrap();
327+
let mut results = Vec::new();
328+
let mut offset = 0;
329+
let mut idx = 0;
330+
331+
while idx < peptide.len() {
332+
if let Some(mat) = re_mass.find_at(peptide, idx) {
333+
if mat.start() == idx {
334+
let cap = re_mass.captures(&peptide[idx..mat.end()]).unwrap();
335+
let mass_str = format!("{:.4}", cap[1].parse::<f64>().unwrap_or(0.0));
336+
let pos = idx - offset;
337+
results.push((mass_str, pos));
338+
offset += mat.end() - mat.start();
339+
idx = mat.end();
340+
continue;
341+
}
342+
}
343+
idx += peptide[idx..].chars().next().unwrap().len_utf8();
344+
}
345+
346+
results
347+
}
295348

296-
// Map modifications based on extracted masses and indices
297-
for (mass, index) in extracted_masses_and_indices {
298-
// Subtract 1 from index to get 0-based index, ensure it's within bounds
299-
let index = index.saturating_sub(1);
300-
let amino_acid = naked_peptide.chars().nth(index).unwrap_or('\0');
301349

302-
if let Some(modification) = modification_map
303-
.get(&(format!("{:.4}", mass), Some(amino_acid)))
304-
{
305-
found_modifications.push(modification.name.clone());
306-
} else if let Some(modification) =
307-
modification_map.get(&(format!("{:.4}", mass), None))
308-
{
309-
found_modifications.push(modification.name.clone());
350+
/// Extracts UniMod annotations (e.g., (UniMod:4)) from a peptide string and returns them
351+
/// as a vector of (unimod_id_string, position) where position is the index of the annotated amino acid.
352+
///
353+
/// # Example
354+
/// ```
355+
/// use redeem_properties::utils::peptdeep_utils::extract_unimod_annotations;
356+
/// let result = extract_unimod_annotations("AC(UniMod:4)DE");
357+
/// assert_eq!(result, vec![("UniMod:4".to_string(), 2)]);
358+
/// ```
359+
pub fn extract_unimod_annotations(peptide: &str) -> Vec<(String, usize)> {
360+
let re_unimod = Regex::new(r"\(UniMod:(\d+)\)").unwrap();
361+
let mut results = Vec::new();
362+
let mut offset = 0;
363+
let mut idx = 0;
364+
365+
while idx < peptide.len() {
366+
if let Some(mat) = re_unimod.find_at(peptide, idx) {
367+
if mat.start() == idx {
368+
let cap = re_unimod.captures(&peptide[idx..mat.end()]).unwrap();
369+
let unimod_str = format!("UniMod:{}", &cap[1]);
370+
let pos = idx - offset;
371+
results.push((unimod_str, pos));
372+
offset += mat.end() - mat.start();
373+
idx = mat.end();
374+
continue;
375+
}
310376
}
377+
idx += peptide[idx..].chars().next().unwrap().len_utf8();
311378
}
312379

313-
found_modifications.join(";")
380+
results
314381
}
315382

316383

384+
/// Attempts to look up a modification name from a map using the provided key and amino acid.
385+
/// Falls back to a key with `None` if the exact amino acid is not matched.
386+
///
387+
/// # Example
388+
/// ```
389+
/// use redeem_properties::utils::peptdeep_utils::{ModificationMap, lookup_modification};
390+
/// let mut map = std::collections::HashMap::new();
391+
/// map.insert(("57.0215".to_string(), Some('C')), ModificationMap { name: "Carbamidomethyl@C".to_string(), amino_acid: Some('C'), unimod_id: Some(4) });
392+
///
393+
/// let result = lookup_modification("57.0215".to_string(), 'C', &map);
394+
/// assert_eq!(result, Some("Carbamidomethyl@C".to_string()));
395+
/// ```
396+
pub fn lookup_modification(
397+
key: String,
398+
aa: char,
399+
map: &HashMap<(String, Option<char>), ModificationMap>,
400+
) -> Option<String> {
401+
map.get(&(key.clone(), Some(aa)))
402+
.or_else(|| map.get(&(key, None)))
403+
.map(|m| m.name.clone())
404+
}
405+
406+
407+
408+
/// Generates a standardized modification string (e.g., "Carbamidomethyl@C")
409+
/// for a peptide sequence based on mass shifts (e.g., `[+57.0215]`) or
410+
/// UniMod annotations (e.g., `(UniMod:4)`), using a preloaded modification map.
411+
///
412+
/// The function supports both mass-shift format and UniMod notation,
413+
/// matching entries from the `modification_map` using mass or UniMod ID along
414+
/// with the local amino acid context.
415+
///
416+
/// # Arguments
417+
/// * `peptide` - A modified peptide sequence string (e.g., `"MGC[+57.0215]AAR"` or `"MGC(UniMod:4)AAR"`).
418+
/// * `modification_map` - A HashMap mapping (key, amino_acid) to `ModificationMap`.
419+
/// - For `[+mass]`, key is formatted as a mass string (e.g., `"57.0215"`).
420+
/// - For `(UniMod:ID)`, key is the UniMod ID as string (e.g., `"4"`).
421+
///
422+
/// # Returns
423+
/// A `String` containing semicolon-separated modification names (e.g., `"Carbamidomethyl@C"`).
424+
///
425+
/// # Example
426+
/// ```
427+
/// use std::collections::HashMap;
428+
/// use redeem_properties::utils::peptdeep_utils::{load_modifications, get_modification_string};
429+
///
430+
/// let mod_map = load_modifications().unwrap();
431+
/// let peptide1 = "MGC[+57.0215]AAR";
432+
/// let result1 = get_modification_string(peptide1, &mod_map);
433+
/// assert_eq!(result1, "Carbamidomethyl@C");
434+
///
435+
/// let peptide2 = "MGC(UniMod:4)AAR";
436+
/// let result2 = get_modification_string(peptide2, &mod_map);
437+
/// assert_eq!(result2, "Carbamidomethyl@C");
438+
/// ```
439+
pub fn get_modification_string(
440+
peptide: &str,
441+
modification_map: &HashMap<(String, Option<char>), ModificationMap>,
442+
) -> String {
443+
let naked_peptide = remove_mass_shift(peptide);
444+
let mut found_mods = Vec::new();
445+
446+
for (key, pos) in extract_mass_annotations(peptide)
447+
.into_iter()
448+
.chain(extract_unimod_annotations(peptide))
449+
{
450+
let aa = naked_peptide.chars().nth(pos.saturating_sub(1)).unwrap_or('\0');
451+
if let Some(name) = lookup_modification(key, aa, modification_map) {
452+
found_mods.push(name);
453+
}
454+
}
455+
456+
found_mods.join(";")
457+
}
458+
317459

318460
// TODO: Derive from PeptDep constants yaml
319461
const IM_GAS_MASS: f64 = 28.0;

0 commit comments

Comments
 (0)