In the current implementation of the Scanless Interface (SLIF) in the Marpa parser, the lexer seems to do longest token matching (LTM) in the following fashion:
This produces frustrating parse fails when my grammar contains tokens that would match the longest substring, but cannot occur at the current position. Consider the following code:
#!/usr/bin/env perl
use strict; use warnings; use feature qw/say/; use utf8;
use Marpa::R2;
use Data::Dump;
my @data = ('! key : value', '! key:value');
my $grammar = Marpa::R2::Scanless::G->new({
    source => \<<'END_GRAMMAR',
        :default ::= action => [values]
        :start   ::= record
        :discard  ~  ws
        ws        ~  [\s]+
        record ::= ('!') key (':') value
        key     ~  [\w]+
        value   ~  [^\s]+
END_GRAMMAR
});
for my $data (@data) {
    my $recce = Marpa::R2::Scanless::R->new({
        grammar => $grammar,
        trace_terminals => 0, # set this to "1" to see how the tokens are recognized
    });
    $recce->read(\$data);
    my $val = $recce->value // die "no parse";
    say ">> $data";
    dd $$val;
}
This produces the output:
>> ! key : value
["key", "value"]
Error in SLIF G1 read: No lexemes accepted at position 2
* Error was at end of input
* String before error: ! key:value
Marpa::R2 exception at marpa.pl line 33.
Expected output:
>> ! key : value
["key", "value"]
>> ! key:value
["key", "value"]
After ! was recognized, a key token must follow. During lexing at this position, the value token matches the longest substring key:value although it cannot occur at this position. Therefore, the parse fails.
Question: Is it possible to achieve the expected output without writing a manual lexer?
(I know that a lexer can query the recognizer for expected tokens, and could restrict itself to matching only these tokens, but I don't know how to convince the SLIF to do this for me.)
I am running Marpa::R2 v2.064 on perl5 v16.2
Following  Jeffrey Kegler's advice, I implemented a rule that will always match a longer substring than a plain value and is therefore preferred. Using a pause event, I can then parse it manually, although I have to keep a phantom rule around for correct semantics.
Here is the full, updated code incl. event handling and an updated test case:
#!/usr/bin/env perl
use strict; use warnings; use feature qw/say/; use utf8;
use Marpa::R2;
use Data::Dump;
my @data = ('! key : value', '! key:value', '! key :value', '! key: value');
my $grammar = Marpa::R2::Scanless::G->new({
    source => \<<'END_GRAMMAR',
        :default ::= action => [values]
        :start   ::= Record
        :discard  ~  ws
        ws        ~  [\s]+
        Record ::=
                ('!') Key (<Op colon>) Value # not directly used
            |   ('!') KeyValue
        Key     ~  key
        Value   ~  value
        KeyValue~  key <ws any> ':' <ws any> value
        :lexeme ~ KeyValue pause => before event => 'before KeyValue'
        <Op colon> ~ ':'
        key     ~  [\w]+
        value   ~  [^\s]+
        <ws any>~  [\s]*
END_GRAMMAR
});
my %events = (
    'before KeyValue' => sub {
        my ($recce, $string, $start, $length) = @_;
        my ($k, $o, $v) = split /(\s*:\s*)/, $string, 2;
        say STDERR qq(k="$k" o="$o" v="$v");
        my $pos = $start;
        $recce->lexeme_read('Key'      => $pos, length($k), $k);
        $pos += length $k;
        $recce->lexeme_read('Op colon' => $pos, length($o), $o);
        $pos += length $o;
        $recce->lexeme_read('Value'    => $pos, length($v), $v);
    },
);
for my $data (@data) {
    my $recce = Marpa::R2::Scanless::R->new({
        grammar => $grammar,
        trace_terminals => 0,
    });
    my $length = length $data;
    for (
        my $pos = $recce->read(\$data);
        $pos < $length;
        $pos = $recce->resume()
    ) {
        say STDERR "pause";
        my ($start, $length) = $recce->pause_span();
        my $str = substr $data, $start, $length;
        for my $event_data (@{ $recce->events }) {
            my ($name) = @$event_data;
            my $code = $events{$name} // die "no code for event $name";
            $recce->$code($str, $start, $length);
        }
    }
    my $val = $recce->value // die "no parse";
    say ">> $data";
    dd $$val;
}
This produces
>> ! key : value
["key", "value"]
>> ! key:value
["key", "value"]
>> ! key :value
["key", "value"]
>> ! key: value
["key", "value"]
which is the expected behaviour.
Please note that since version 2.079_015, Marpa supports the notion of Longest Acceptable Tokens Matching, meaning that just adding:
lexeme default = forgiving => 1
to your grammar will produce the expected output. I.e.:
#!env perl -w
use strict;
use Marpa::R2;
use Data::Dump;
use feature qw/say/;
my $grammar = Marpa::R2::Scanless::G->new({source => \do {local $/; <DATA>}});
my @data = ('! key : value', '! key:value', '! key :value', '! key: value');
foreach (@data) {
    my $r = Marpa::R2::Scanless::R->new({grammar => $grammar});
    $r->read(\$_);
    my $val = $r->value;
    say ">> $_"; dd $$val;
}
__DATA__
:default ::= action => [values]
lexeme default = forgiving => 1
:start   ::= record
:discard  ~  ws
 ws        ~  [\s]+
record ::= ('!') key (':') value
key     ~  [\w]+
value   ~  [^\s]+
will give:
>> ! key : value
["key", "value"]
>> ! key:value
["key", "value"]
>> ! key :value
["key", "value"]
>> ! key: value
["key", "value"]
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With