@@ -184,48 +184,76 @@ pub fn load_mod_to_feature(constants: &ModelConstants) -> Result<HashMap<String,
184184}
185185
186186
187- // #[derive(Debug, Clone)]
/// One modification entry as stored in the lookup table built by
/// `load_modifications`.
///
/// The same entry may be stored under more than one key (a mass-based key and a
/// UniMod-id-based key), which is why the type is `Clone`. `PartialEq`/`Eq` are
/// derived so entries can be compared directly, e.g. in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ModificationMap {
    /// Modification name, e.g. `Carbamidomethyl@C`.
    pub name: String,
    /// First character following `@` in `name`; `None` when the name has no
    /// `@` part (or nothing after it).
    pub amino_acid: Option<char>,
    /// Numeric UniMod identifier, when one could be parsed for this entry.
    pub unimod_id: Option<u32>,
}
192193
193194
195+ /// Loads a unified modification map where the key is either:
196+ /// - ("57.0215", Some('C')) for mass-based lookup
197+ /// - ("UniMod:4", Some('C')) for UniMod ID–based lookup
194198pub fn load_modifications ( ) -> Result < HashMap < ( String , Option < char > ) , ModificationMap > > {
195199 let path: PathBuf = ensure_mod_tsv_exists ( ) . context ( "Failed to ensure TSV exists" ) ?;
196200
197201 let mut rdr = ReaderBuilder :: new ( )
198202 . delimiter ( b'\t' )
199- . from_path ( path) . context ( "Failed to read TSV file" ) ?;
203+ . from_path ( & path)
204+ . context ( "Failed to read modification TSV file" ) ?;
200205
201206 let mut modifications = HashMap :: new ( ) ;
202-
207+
203208 for result in rdr. records ( ) {
204209 let record = result. context ( "Failed to read record" ) ?;
205210 let mod_name = record. get ( 0 ) . unwrap_or ( "" ) . to_string ( ) ;
206211 let unimod_mass: f64 = record. get ( 1 ) . unwrap_or ( "0" ) . parse ( ) . unwrap_or ( 0.0 ) ;
207-
208- // Convert mass to string with 4 decimal places
212+ let unimod_id : Option < u32 > = record . get ( 7 ) . and_then ( |s| s . parse ( ) . ok ( ) ) ;
213+
209214 let mass_key = format ! ( "{:.4}" , unimod_mass) ;
210-
211- // Extract amino acid from mod_name
215+ let unimod_key = unimod_id . map ( |id| format ! ( "UniMod:{}" , id ) ) ;
216+
212217 let amino_acid = mod_name. split ( '@' ) . nth ( 1 ) . and_then ( |aa| aa. chars ( ) . next ( ) ) ;
213218
214- // Create Modification struct
215219 let modification = ModificationMap {
216220 name : mod_name,
217221 amino_acid,
222+ unimod_id,
218223 } ;
219224
220- // Insert into HashMap
221- modifications. insert ( ( mass_key, amino_acid) , modification) ;
225+ // Insert mass-based key
226+ modifications. insert ( ( mass_key. clone ( ) , amino_acid) , modification. clone ( ) ) ;
227+
228+ // Insert unimod-id based key if available
229+ if let Some ( key) = unimod_key {
230+ modifications. insert ( ( key, amino_acid) , modification. clone ( ) ) ;
231+ }
222232 }
223233
224234 Ok ( modifications)
225235}
226236
237+
238+
239+
240+ /// Removes mass shifts and UniMod annotations from a modified peptide sequence.
241+ ///
242+ /// Supports both bracketed mass shifts (e.g., `[+57.0215]`) and UniMod-style
243+ /// annotations (e.g., `(UniMod:4)`).
244+ ///
245+ /// # Example
246+ /// ```
247+ /// use easypqp_core::data_handling::remove_mass_shift;
248+ ///
249+ /// let peptide = "MGC[+57.0215]AAR";
250+ /// assert_eq!(remove_mass_shift(peptide), "MGCAAR");
251+ /// let peptide = "MGC(UniMod:4)AAR";
252+ /// assert_eq!(remove_mass_shift(peptide), "MGCAAR");
253+ /// ```
227254pub fn remove_mass_shift ( peptide : & str ) -> String {
228- let re = Regex :: new ( r"\[.*?\]" ) . unwrap ( ) ;
255+ // Regex to remove either [mass shift] or (UniMod:x) patterns
256+ let re = Regex :: new ( r"(\[.*?\]|\(UniMod:\d+\))" ) . unwrap ( ) ;
229257 re. replace_all ( peptide, "" ) . to_string ( )
230258}
231259
@@ -283,37 +311,151 @@ pub fn get_modification_indices(peptide: &str) -> String {
283311 indices. join ( ";" )
284312}
285313
286- pub fn get_modification_string (
287- peptide : & str ,
288- modification_map : & HashMap < ( String , Option < char > ) , ModificationMap > ,
289- ) -> String {
290- let naked_peptide = remove_mass_shift ( peptide) ;
291314
292- let extracted_masses_and_indices = extract_masses_and_indices ( & peptide. to_string ( ) ) ;
293315
294- let mut found_modifications = Vec :: new ( ) ;
316+ /// Extracts mass shift annotations (e.g., [+57.0215]) from a peptide string and returns them
317+ /// as a vector of (mass_string, position) where position is the index of the annotated amino acid.
318+ ///
319+ /// # Example
320+ /// ```
321+ /// use redeem_properties::utils::peptdeep_utils::extract_mass_annotations;
322+ /// let result = extract_mass_annotations("AC[+57.0215]DE");
323+ /// assert_eq!(result, vec![("57.0215".to_string(), 2)]);
324+ /// ```
325+ pub fn extract_mass_annotations ( peptide : & str ) -> Vec < ( String , usize ) > {
326+ let re_mass = Regex :: new ( r"\[([+-]?\d*\.?\d+)\]" ) . unwrap ( ) ;
327+ let mut results = Vec :: new ( ) ;
328+ let mut offset = 0 ;
329+ let mut idx = 0 ;
330+
331+ while idx < peptide. len ( ) {
332+ if let Some ( mat) = re_mass. find_at ( peptide, idx) {
333+ if mat. start ( ) == idx {
334+ let cap = re_mass. captures ( & peptide[ idx..mat. end ( ) ] ) . unwrap ( ) ;
335+ let mass_str = format ! ( "{:.4}" , cap[ 1 ] . parse:: <f64 >( ) . unwrap_or( 0.0 ) ) ;
336+ let pos = idx - offset;
337+ results. push ( ( mass_str, pos) ) ;
338+ offset += mat. end ( ) - mat. start ( ) ;
339+ idx = mat. end ( ) ;
340+ continue ;
341+ }
342+ }
343+ idx += peptide[ idx..] . chars ( ) . next ( ) . unwrap ( ) . len_utf8 ( ) ;
344+ }
345+
346+ results
347+ }
295348
296- // Map modifications based on extracted masses and indices
297- for ( mass, index) in extracted_masses_and_indices {
298- // Subtract 1 from index to get 0-based index, ensure it's within bounds
299- let index = index. saturating_sub ( 1 ) ;
300- let amino_acid = naked_peptide. chars ( ) . nth ( index) . unwrap_or ( '\0' ) ;
301349
302- if let Some ( modification) = modification_map
303- . get ( & ( format ! ( "{:.4}" , mass) , Some ( amino_acid) ) )
304- {
305- found_modifications. push ( modification. name . clone ( ) ) ;
306- } else if let Some ( modification) =
307- modification_map. get ( & ( format ! ( "{:.4}" , mass) , None ) )
308- {
309- found_modifications. push ( modification. name . clone ( ) ) ;
350+ /// Extracts UniMod annotations (e.g., (UniMod:4)) from a peptide string and returns them
351+ /// as a vector of (unimod_id_string, position) where position is the index of the annotated amino acid.
352+ ///
353+ /// # Example
354+ /// ```
355+ /// use redeem_properties::utils::peptdeep_utils::extract_unimod_annotations;
356+ /// let result = extract_unimod_annotations("AC(UniMod:4)DE");
357+ /// assert_eq!(result, vec![("UniMod:4".to_string(), 2)]);
358+ /// ```
359+ pub fn extract_unimod_annotations ( peptide : & str ) -> Vec < ( String , usize ) > {
360+ let re_unimod = Regex :: new ( r"\(UniMod:(\d+)\)" ) . unwrap ( ) ;
361+ let mut results = Vec :: new ( ) ;
362+ let mut offset = 0 ;
363+ let mut idx = 0 ;
364+
365+ while idx < peptide. len ( ) {
366+ if let Some ( mat) = re_unimod. find_at ( peptide, idx) {
367+ if mat. start ( ) == idx {
368+ let cap = re_unimod. captures ( & peptide[ idx..mat. end ( ) ] ) . unwrap ( ) ;
369+ let unimod_str = format ! ( "UniMod:{}" , & cap[ 1 ] ) ;
370+ let pos = idx - offset;
371+ results. push ( ( unimod_str, pos) ) ;
372+ offset += mat. end ( ) - mat. start ( ) ;
373+ idx = mat. end ( ) ;
374+ continue ;
375+ }
310376 }
377+ idx += peptide[ idx..] . chars ( ) . next ( ) . unwrap ( ) . len_utf8 ( ) ;
311378 }
312379
313- found_modifications . join ( ";" )
380+ results
314381}
315382
316383
384+ /// Attempts to look up a modification name from a map using the provided key and amino acid.
385+ /// Falls back to a key with `None` if the exact amino acid is not matched.
386+ ///
387+ /// # Example
388+ /// ```
389+ /// use redeem_properties::utils::peptdeep_utils::{ModificationMap, lookup_modification};
390+ /// let mut map = std::collections::HashMap::new();
391+ /// map.insert(("57.0215".to_string(), Some('C')), ModificationMap { name: "Carbamidomethyl@C".to_string(), amino_acid: Some('C'), unimod_id: Some(4) });
392+ ///
393+ /// let result = lookup_modification("57.0215".to_string(), 'C', &map);
394+ /// assert_eq!(result, Some("Carbamidomethyl@C".to_string()));
395+ /// ```
396+ pub fn lookup_modification (
397+ key : String ,
398+ aa : char ,
399+ map : & HashMap < ( String , Option < char > ) , ModificationMap > ,
400+ ) -> Option < String > {
401+ map. get ( & ( key. clone ( ) , Some ( aa) ) )
402+ . or_else ( || map. get ( & ( key, None ) ) )
403+ . map ( |m| m. name . clone ( ) )
404+ }
405+
406+
407+
408+ /// Generates a standardized modification string (e.g., "Carbamidomethyl@C")
409+ /// for a peptide sequence based on mass shifts (e.g., `[+57.0215]`) or
410+ /// UniMod annotations (e.g., `(UniMod:4)`), using a preloaded modification map.
411+ ///
412+ /// The function supports both mass-shift format and UniMod notation,
413+ /// matching entries from the `modification_map` using mass or UniMod ID along
414+ /// with the local amino acid context.
415+ ///
416+ /// # Arguments
417+ /// * `peptide` - A modified peptide sequence string (e.g., `"MGC[+57.0215]AAR"` or `"MGC(UniMod:4)AAR"`).
418+ /// * `modification_map` - A HashMap mapping (key, amino_acid) to `ModificationMap`.
419+ /// - For `[+mass]`, key is formatted as a mass string (e.g., `"57.0215"`).
420+ /// - For `(UniMod:ID)`, key is the UniMod ID as string (e.g., `"4"`).
421+ ///
422+ /// # Returns
423+ /// A `String` containing semicolon-separated modification names (e.g., `"Carbamidomethyl@C"`).
424+ ///
425+ /// # Example
426+ /// ```
427+ /// use std::collections::HashMap;
428+ /// use redeem_properties::utils::peptdeep_utils::{load_modifications, get_modification_string};
429+ ///
430+ /// let mod_map = load_modifications().unwrap();
431+ /// let peptide1 = "MGC[+57.0215]AAR";
432+ /// let result1 = get_modification_string(peptide1, &mod_map);
433+ /// assert_eq!(result1, "Carbamidomethyl@C");
434+ ///
435+ /// let peptide2 = "MGC(UniMod:4)AAR";
436+ /// let result2 = get_modification_string(peptide2, &mod_map);
437+ /// assert_eq!(result2, "Carbamidomethyl@C");
438+ /// ```
439+ pub fn get_modification_string (
440+ peptide : & str ,
441+ modification_map : & HashMap < ( String , Option < char > ) , ModificationMap > ,
442+ ) -> String {
443+ let naked_peptide = remove_mass_shift ( peptide) ;
444+ let mut found_mods = Vec :: new ( ) ;
445+
446+ for ( key, pos) in extract_mass_annotations ( peptide)
447+ . into_iter ( )
448+ . chain ( extract_unimod_annotations ( peptide) )
449+ {
450+ let aa = naked_peptide. chars ( ) . nth ( pos. saturating_sub ( 1 ) ) . unwrap_or ( '\0' ) ;
451+ if let Some ( name) = lookup_modification ( key, aa, modification_map) {
452+ found_mods. push ( name) ;
453+ }
454+ }
455+
456+ found_mods. join ( ";" )
457+ }
458+
317459
318460// TODO: Derive from PeptDep constants yaml
319461const IM_GAS_MASS : f64 = 28.0 ;
0 commit comments