Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Iterating through capture fields in a Rust regex

I'm playing around with the frowns parser available from http://frowns.sourceforge.net, a parser that tokenizes chemical formula strings written in the SMILES standard. Specifically, I'm trying to port it to Rust.

The original regex for an "atom" token in the parser looks like this (Python):

# Regex alternation of the symbols accepted for the `element` group of the
# `atom` pattern below: capitalised element symbols, the lowercase forms
# c, n, o, s, p, and a literal `*`. It is spliced into the `atom` regex by
# string concatenation, so it must not contain named groups of its own.
element_symbols_pattern = \
  r"C[laroudsemf]?|Os?|N[eaibdpos]?|S[icernbmg]?|P[drmtboau]?|"  \
  r"H[eofgas]?|c|n|o|s|p|A[lrsgutcm]|B[eraik]?|Dy|E[urs]|F[erm]?|"  \
  r"G[aed]|I[nr]?|Kr?|L[iaur]|M[gnodt]|R[buhenaf]|T[icebmalh]|" \
  r"U|V|W|Xe|Yb?|Z[nr]|\*"

# Names of the named capture groups of the `atom` regex, listed in the same
# order in which the groups appear in the pattern. The three `error_*`
# entries are the groups that capture malformed input.
atom_fields = [
    "raw_atom",
    "open_bracket",
    "weight",
    "element",
    "chiral_count",
    "chiral_named",
    "chiral_symbols",
    "hcount",
    "positive_count",
    "positive_symbols",
    "negative_count",
    "negative_symbols",
    "error_1",
    "error_2",
    "close_bracket",
    "error_3",
    ]

# Verbose-mode (re.X) regex for a single atom token. It matches either a
# "raw" atom written outside brackets, or a bracketed atom made of an opening
# bracket, an optional weight, an element followed by optional chirality,
# hydrogen count and charge, and finally a closing bracket. Malformed input
# is captured by the error_1 / error_2 / error_3 groups rather than simply
# failing to match, so callers can inspect those groups to report errors.
# NOTE: the inline comment on the `@TH[12]` line reads "@TA1 @TA2", but the
# pattern on that line matches @TH1 / @TH2.
atom = re.compile(r"""
(?P<raw_atom>Cl|Br|[cnospBCNOFPSI]) |  # "raw" means outside of brackets
(
  (?P<open_bracket>\[)                 # Start bracket
  (?P<weight>\d+)?                     # Atomic weight (optional)
  (                                    # valid term or error
   (                                   #   valid term
    (?P<element>""" + element_symbols_pattern + r""")  # element or aromatic
    (                                  # Chirality can be
     (?P<chiral_count>@\d+) |          #   @1 @2 @3 ...
     (?P<chiral_named>                 # or
       @TH[12] |                       #   @TA1 @TA2
       @AL[12] |                       #   @AL1 @AL2
       @SP[123] |                      #   @SP1 @SP2 @SP3
       @TB(1[0-9]?|20?|[3-9]) |        #   @TB{1-20}
       @OH(1[0-9]?|2[0-9]?|30?|[4-9])) | # @OH{1-30}
     (?P<chiral_symbols>@+)            # or @@@@@@@...
    )?                                 # and chirality is optional
    (?P<hcount>H\d*)?                  # Optional hydrogen count
    (                                  # Charges can be
     (?P<positive_count>\+\d+) |       #   +<number>
     (?P<positive_symbols>\++) |       #   +++...  This includes the single '+'
     (?P<negative_count>-\d+)  |       #   -<number>
     (?P<negative_symbols>-+)          #   ---...  including a single '-'
    )?                                 # and are optional
    (?P<error_1>[^\]]+)?               # If there's anything left, it's an error
  ) | (                                # End of parsing stuff in []s, except
    (?P<error_2>[^\]]*)                # If there was an error, we get here
  ))
  ((?P<close_bracket>\])|              # End bracket
   (?P<error_3>$))                     # unexpectedly reached end of string
)
""", re.X)

The field list is used to improve the reportability of the regex parser, as well as track parsing errors.

I wrote something that compiles and parses tokens without brackets properly, but something about the inclusion of brackets (such as [S] instead of S) breaks it. So I've narrowed it down with comments:

extern crate regex;
use regex::Regex;

// Reduced reproduction of the bracketed-atom regex port.
//
// Builds a cut-down version of the atom regex (most optional groups are
// commented out), prints the generated pattern, compiles it, and then tries
// to capture against each character of "[S]" individually.
fn main() {
    // Names of the capture groups in the full regex. In this reduced example
    // the list is only referenced by the commented-out block inside the loop.
    let atom_fields: Vec<&'static str> = vec![
        "raw_atom",
        "open_bracket",
        "weight",
        "element",
        "chiral_count",
        "chiral_named",
        "chiral_symbols",
        "hcount",
        "positive_count",
        "positive_symbols",
        "negative_count",
        "negative_symbols",
        "error_1",
        "error_2",
        "close_bracket",
        "error_3"
    ];

    // Reduced element pattern: an optional `S`, or a literal `*` followed by `"`.
    // NOTE(review): the `"` before the closing `)` is inside the raw string and
    // is therefore part of the pattern; presumably unintended, confirm.
    const EL_SYMBOLS: &'static str = r#"(?P<element>S?|\*")"#;
    // The pattern is assembled by joining the fragments below with no
    // separator; fragments that are commented out are simply left out.
      let atom_re_str: &String = &String::from(vec![
//    r"(?P<raw_atom>Cl|Br|[cnospBCNOFPSI])|", // "raw" means outside of brackets
        r"(",
        r"(?P<open_bracket>\[)",                 // Start bracket
//      r"(?P<weight>\d+)?",                     // Atomic weight (optional)
        r"(",                                    // valid term or error
         r"(",                                   // valid term    
          &EL_SYMBOLS,                           // element or aromatic
//       r"(",                                  // Chirality can be
//        r"(?P<chiral_count>@\d+)|",           //   @1 @2 @3 ...    
//        r"(?P<chiral_named>",                 // or
//         r"@TH[12]|",                         //   @TA1 @TA2
//         r"@AL[12]|",                         //   @AL1 @AL2
//         r"@SP[123]|",                        //   @SP1 @SP2 @SP3    
//         r"@TB(1[0-9]?|20?|[3-9])|",          //   @TB{1-20}    
//         r"@OH(1[0-9]?|2[0-9]?|30?|[4-9]))|", //   @OH{1-30}    
//         r"(?P<chiral_symbols>@+)",           // or @@@@....,
//        r")?",                                // and chirality is optional    
//       r"(?P<hcount>H\d*)?",                  // Optional hydrogen count    
//       r"(",                                  // Charges can be    
//        r"(?P<positive_count>\+\d+)|",        //   +<number>    
//        r"(?P<positive_symbols>\++)|",        //   +++...including a single '+'    
//        r"(?P<negative_count>-\d+)|",         //   -<number>    
//        r"(?P<negative_symbols>-+)",          //   ---... including a single '-'
//       r")?",                                 // and are optional    
//      r"(?P<error_1>[^\]]+)?",                // anything left is an error    
        r")",                                  // End of stuff in []s, except    
        r"|((?P<error_2>[^\]]*)",                  // If other error, we get here
        r"))",
        r"((?P<close_bracket>\])|",              // End bracket    
        r"(?P<error_3>$)))"].join(""));          // unexpected end of string

    println!("generated regex: {}", &atom_re_str);
    // Panics here if the assembled pattern is not a valid regex.
    let atom_re = Regex::new(&atom_re_str).unwrap();

    // Each iteration matches the regex against a one-character string, not
    // against the whole input "[S]".
    for cur_char in "[S]".chars() {
        let cur_string = cur_char.to_string();
        println!("cur string: {}", &cur_string);
        // `captures` returns an Option; `unwrap()` panics when the
        // one-character string does not match the pattern, which always
        // requires a leading `[`.
        let captures = atom_re.captures(&cur_string.as_str()).unwrap();
// Disabled field-by-field error reporting.
// NOTE(review): it looks up a group named "atom", which the pattern above
// does not define.
// if captures.name("atom").is_some() {
//   for cur_field in &atom_fields {
//     let field_capture = captures.name(cur_field);
//     if cur_field.contains("error") {
//       if *cur_field == "error_3" {
//         // TODO replace me with a real error
//         println!("current char: {:?}", &cur_char);
//         panic!("Missing a close bracket (]). Looks like: {}.",
//                field_capture.unwrap());
//       } else {
//         panic!("I don't recognize the character. Looks like: {}.",
//                field_capture.unwrap());
//       }      
//     } else {
//       println!("ok! matched {:?}", &cur_char);
//     }
//   }
// }
  }
}

--

You can see that the generated Rust regex works in Debuggex:

((?P<open_bracket>\[)(((?P<element>S?|\*"))|((?P<error_2>[^\]]*)))((?P<close_bracket>\])|(?P<error_3>$)))

Regular expression visualization

(http://debuggex.com/r/7j75Y2F1ph1v9jfL)

If you run the example (https://gitlab.com/araster/frowns_regex), you'll see that the open bracket parses correctly, but the .captures().unwrap() dies on the next character 'S'. If I use the complete expression I can parse all kinds of things from the frowns test file, as long as they don't have brackets.

What am I doing wrong?

like image 787
Alex Raster Avatar asked Jan 24 '26 06:01

Alex Raster


1 Answer

You are iterating over each character of your input string and trying to match the regex against a string made up of that single character. However, this regex is not designed to match individual characters: it matches [S] as a whole. For a one-character string that does not match (such as "S"), captures returns None, and the unwrap() call panics.

If you want to be able to find multiple matches in a single string, use captures_iter instead of captures to iterate over all matches and their respective captures. Each match will be a formula, and the regex will skip any text that doesn't match a formula.

// `atom_re` is the compiled regex from the question. The whole string "[S]"
// is passed in, and the loop body runs once per match found in it.
for captures in atom_re.captures_iter("[S]") {
    // check the captures of each match
}

If you only want to find the first match in a string, then use captures on the whole string, rather than on each individual character.

like image 73
Francis Gagné Avatar answered Jan 26 '26 21:01

Francis Gagné



Donate For Us

If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!