| File: | lib/Parse/EBNF/Rule.pm |
| Coverage: | 81.6% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | package Parse::EBNF::Rule; | |||||
| 2 | ||||||
| 3 | 2 2 2 | 1761 6 22 | use Parse::EBNF::Token; | |||
| 4 | ||||||
| 5 | sub new { | |||||
| 6 | 2 | 9 | my ($class, $rule) = @_; | |||
| 7 | 2 | 10 | my $self = bless {}, $class; | |||
| 8 | 2 | 8 | $self->{error} = 0; | |||
| 9 | ||||||
| 10 | 2 | 11 | $self->parse($rule) if defined $rule; | |||
| 11 | ||||||
| 12 | 2 | 8 | return $self; | |||
| 13 | } | |||||
| 14 | ||||||
| 15 | sub parse { | |||||
| 16 | 22 | 72 | my ($self, $rule) = @_; | |||
| 17 | ||||||
| 18 | 22 | 58 | $self->{error} = 0; | |||
| 19 | ||||||
| 20 | # strip comments | |||||
| 21 | 22 | 59 | $rule =~ s!/\*([^\*]|\*[^\/])*\*\/!!g; | |||
| 22 | ||||||
| 23 | 22 | 110 | unless ($rule =~ m!^\s*\[(\d+)\]\s*([A-Z][a-zA-Z]*)\s*\:\:=!){ | |||
| 24 | ||||||
| 25 | 2 | 13 | $self->{error} = "can't parse rule $rule"; | |||
| 26 | 2 | 5 | return; | |||
| 27 | } | |||||
| 28 | ||||||
| 29 | 20 | 73 | $self->{index} = $1; | |||
| 30 | 20 | 55 | $self->{name} = $2; | |||
| 31 | ||||||
| 32 | 20 | 60 | $rule =~ s!^(.*?)\:\:=!!; | |||
| 33 | ||||||
| 34 | 20 | 54 | $self->{rule} = $rule; | |||
| 35 | ||||||
| 36 | ||||||
| 37 | # now try and tokenise the rule | |||||
| 38 | # we first tokenise it, and *then* split it into alternations, | |||||
| 39 | # since finding the pipes will be tricky if they occur inside | |||||
| 40 | # literals or character classes | |||||
| 41 | ||||||
| 42 | 20 | 45 | my $tokens = []; | |||
| 43 | ||||||
| 44 | 20 | 46 | $rule =~ s/^\s+//; | |||
| 45 | ||||||
| 46 | 20 | 56 | while($rule){ | |||
| 47 | 32 | 61 | my $token = undef; | |||
| 48 | ||||||
| 49 | 32 | 281 | if ($rule =~ m!^'([^']+)'!){ | |||
| 50 | ||||||
| 51 | 3 | 11 | $token = Parse::EBNF::Token->new(); | |||
| 52 | 3 | 10 | $token->{content} = $1; | |||
| 53 | 3 | 9 | $token->{type} = 'literal'; | |||
| 54 | 3 | 14 | $rule = substr $rule, 2 + length $1; | |||
| 55 | ||||||
| 56 | }elsif ($rule =~ m!^"([^"]+)"!){ | |||||
| 57 | ||||||
| 58 | 1 | 3 | $token = Parse::EBNF::Token->new(); | |||
| 59 | 1 | 4 | $token->{content} = $1; | |||
| 60 | 1 | 3 | $token->{type} = 'literal'; | |||
| 61 | 1 | 5 | $rule = substr $rule, 2 + length $1; | |||
| 62 | ||||||
| 63 | }elsif ($rule =~ m!^\|!){ | |||||
| 64 | ||||||
| 65 | 2 | 7 | $token = Parse::EBNF::Token->new(); | |||
| 66 | 2 | 6 | $token->{type} = 'alt'; | |||
| 67 | 2 | 6 | $rule = substr $rule, 1; | |||
| 68 | ||||||
| 69 | }elsif ($rule =~ m!^([A-Z][a-zA-Z]*)!){ | |||||
| 70 | ||||||
| 71 | 12 | 41 | $token = Parse::EBNF::Token->new(); | |||
| 72 | 12 | 42 | $token->{content} = $1; | |||
| 73 | 12 | 36 | $token->{type} = 'subrule'; | |||
| 74 | 12 | 35 | $rule = substr $rule, length $1; | |||
| 75 | ||||||
| 76 | }elsif ($rule =~ m!^\[(\^?)(([^\]]|\\\])+)\]!){ | |||||
| 77 | ||||||
| 78 | # some sort of class - sub-parse it | |||||
| 79 | ||||||
| 80 | 8 | 21 | my $neg = $1; | |||
| 81 | 8 | 16 | my $inner = $2; | |||
| 82 | ||||||
| 83 | 8 | 30 | $rule = substr $rule, 2 + length($neg) + length($inner); | |||
| 84 | ||||||
| 85 | 8 | 18 | my $rx = '['.$neg; | |||
| 86 | 8 | 25 | while(length $inner){ | |||
| 87 | ||||||
| 88 | 18 | 93 | if ($inner =~ m!^#x([0-9a-f]+)-#x([0-9a-f]+)!i){ | |||
| 89 | ||||||
| 90 | 2 | 9 | $inner = substr $inner, 5 + length($1) + length($2); | |||
| 91 | 2 | 6 | $rx .= $self->hexchar($1).'-'.$self->hexchar($2); | |||
| 92 | ||||||
| 93 | }elsif ($inner =~ m!^#x([0-9a-f]+)!i){ | |||||
| 94 | ||||||
| 95 | 6 | 19 | $inner = substr $inner, 2 + length($1); | |||
| 96 | 6 | 19 | $rx .= $self->hexchar($1); | |||
| 97 | ||||||
| 98 | }elsif ($inner =~ m!^([^-])-([^-])!i){ | |||||
| 99 | ||||||
| 100 | 4 | 10 | $inner = substr $inner, 3; | |||
| 101 | 4 | 24 | $rx .= quotemeta($1).'-'.quotemeta($2); | |||
| 102 | ||||||
| 103 | }elsif ($inner =~ m!^([^-])!i){ | |||||
| 104 | ||||||
| 105 | 6 | 15 | $inner = substr $inner, 1; | |||
| 106 | 6 | 25 | $rx .= quotemeta($1); | |||
| 107 | ||||||
| 108 | }else{ | |||||
| 109 | ||||||
| 110 | 0 | 0 | $self->{error} = "couldn't parse class rx at $inner"; | |||
| 111 | 0 | 0 | exit; | |||
| 112 | } | |||||
| 113 | } | |||||
| 114 | 8 | 17 | $rx .= ']'; | |||
| 115 | ||||||
| 116 | 8 | 28 | $token = Parse::EBNF::Token->new(); | |||
| 117 | 8 | 22 | $token->{content} = $rx; | |||
| 118 | 8 | 27 | $token->{type} = 'rx'; | |||
| 119 | ||||||
| 120 | ||||||
| 121 | }elsif ($rule =~ m!^\[(([^\]]|\\\])+)\]!){ | |||||
| 122 | ||||||
| 123 | 0 | 0 | $token = Parse::EBNF::Token->new(); | |||
| 124 | 0 | 0 | $token->{content} = $1; | |||
| 125 | 0 | 0 | $token->{type} = 'class'; | |||
| 126 | 0 | 0 | $rule = substr $rule, 2 + length $1; | |||
| 127 | ||||||
| 128 | }elsif ($rule =~ m!^\*!){ | |||||
| 129 | ||||||
| 130 | 1 | 5 | $token = Parse::EBNF::Token->new(); | |||
| 131 | 1 | 4 | $token->{type} = 'rep star'; | |||
| 132 | 1 | 3 | $rule = substr $rule, 1; | |||
| 133 | ||||||
| 134 | }elsif ($rule =~ m!^\+!){ | |||||
| 135 | ||||||
| 136 | 1 | 3 | $token = Parse::EBNF::Token->new(); | |||
| 137 | 1 | 3 | $token->{type} = 'rep plus'; | |||
| 138 | 1 | 4 | $rule = substr $rule, 1; | |||
| 139 | ||||||
| 140 | }elsif ($rule =~ m!^\?!){ | |||||
| 141 | ||||||
| 142 | 1 | 4 | $token = Parse::EBNF::Token->new(); | |||
| 143 | 1 | 3 | $token->{type} = 'rep quest'; | |||
| 144 | 1 | 4 | $rule = substr $rule, 1; | |||
| 145 | ||||||
| 146 | }elsif ($rule =~ m!^\(!){ | |||||
| 147 | ||||||
| 148 | 1 | 4 | $token = Parse::EBNF::Token->new(); | |||
| 149 | 1 | 4 | $token->{type} = 'group start'; | |||
| 150 | 1 | 3 | $rule = substr $rule, 1; | |||
| 151 | ||||||
| 152 | }elsif ($rule =~ m!^\)!){ | |||||
| 153 | ||||||
| 154 | 1 | 3 | $token = Parse::EBNF::Token->new(); | |||
| 155 | 1 | 3 | $token->{type} = 'group end'; | |||
| 156 | 1 | 3 | $rule = substr $rule, 1; | |||
| 157 | ||||||
| 158 | ||||||
| 159 | }elsif ($rule =~ m!^\-!){ | |||||
| 160 | ||||||
| 161 | 0 | 0 | $token = Parse::EBNF::Token->new(); | |||
| 162 | 0 | 0 | $token->{type} = 'dash'; | |||
| 163 | 0 | 0 | $rule = substr $rule, 1; | |||
| 164 | ||||||
| 165 | }elsif ($rule =~ m!^#x([0-9a-f]+)!i){ | |||||
| 166 | ||||||
| 167 | 1 | 4 | $token = Parse::EBNF::Token->new(); | |||
| 168 | 1 | 4 | $token->{content} = $self->hexchar($1); | |||
| 169 | 1 | 5 | $token->{type} = 'rx'; | |||
| 170 | 1 | 5 | $rule = substr $rule, 2 + length $1; | |||
| 171 | ||||||
| 172 | }else{ | |||||
| 173 | ||||||
| 174 | 0 | 0 | $self->{error} = "couldn't parse token at start of $rule"; | |||
| 175 | 0 | 0 | return; | |||
| 176 | } | |||||
| 177 | ||||||
| 178 | 32 32 | 52 83 | push @{$tokens}, $token; | |||
| 179 | ||||||
| 180 | 32 | 128 | $rule =~ s/^\s+//; | |||
| 181 | } | |||||
| 182 | ||||||
| 183 | # | |||||
| 184 | # first we create a base token (of type list) | |||||
| 185 | # which will represent a list of tokens for this rule | |||||
| 186 | # | |||||
| 187 | ||||||
| 188 | 20 | 63 | my $base = Parse::EBNF::Token->new(); | |||
| 189 | 20 | 67 | $base->{type} = 'list'; | |||
| 190 | 20 | 52 | $base->{tokens} = $tokens; | |||
| 191 | 20 | 50 | $self->{base} = $base; | |||
| 192 | ||||||
| 193 | ||||||
| 194 | # | |||||
| 195 | # now we create a node tree from the flat list | |||||
| 196 | # | |||||
| 197 | ||||||
| 198 | 20 | 81 | return unless $self->produce_groups($base); | |||
| 199 | ||||||
| 200 | ||||||
| 201 | # | |||||
| 202 | # and perform recursive cleanups | |||||
| 203 | # | |||||
| 204 | ||||||
| 205 | 20 | 61 | unless ($base->reduce_alternations()){ | |||
| 206 | 0 | 0 | $self->{error} = $base->{error}; | |||
| 207 | 0 | 0 | return; | |||
| 208 | } | |||||
| 209 | ||||||
| 210 | 20 | 61 | unless ($base->reduce_repetition()){ | |||
| 211 | 0 | 0 | $self->{error} = $base->{error}; | |||
| 212 | 0 | 0 | return; | |||
| 213 | } | |||||
| 214 | ||||||
| 215 | # TODO: negations | |||||
| 216 | ||||||
| 217 | 20 | 61 | unless ($base->reduce_empty()){ | |||
| 218 | 0 | 0 | $self->{error} = $base->{error}; | |||
| 219 | 0 | 0 | return; | |||
| 220 | } | |||||
| 221 | ||||||
| 222 | 20 | 60 | unless ($base->reduce_rx()){ | |||
| 223 | 0 | 0 | $self->{error} = $base->{error}; | |||
| 224 | 0 | 0 | return; | |||
| 225 | } | |||||
| 226 | } | |||||
| 227 | ||||||
| 228 | sub hexchar { | |||||
| 229 | 11 | 32 | my ($self, $char) = @_; | |||
| 230 | ||||||
| 231 | 11 | 20 | $char =~ s!^0+!!; | |||
| 232 | ||||||
| 233 | 11 | 39 | if (hex($char) > 255){ | |||
| 234 | ||||||
| 235 | 0 | 0 | return '\\x{'.lc($char).'}'; | |||
| 236 | }else{ | |||||
| 237 | ||||||
| 238 | 11 | 60 | return '\\x'.lc($char); | |||
| 239 | } | |||||
| 240 | } | |||||
| 241 | ||||||
| 242 | sub produce_groups { | |||||
| 243 | 20 | 51 | my ($self, $base) = @_; | |||
| 244 | ||||||
| 245 | 20 | 52 | my $tokens = $base->{tokens}; | |||
| 246 | 20 | 55 | $base->{tokens} = []; | |||
| 247 | 20 | 36 | my $current = $base; | |||
| 248 | ||||||
| 249 | 20 52 | 33 196 | while(my $token = shift @{$tokens}){ | |||
| 250 | ||||||
| 251 | 32 | 171 | if ($token->{type} eq 'group start'){ | |||
| 252 | ||||||
| 253 | 1 | 3 | my $parent = Parse::EBNF::Token->new(); | |||
| 254 | 1 | 4 | $parent->{type} = 'list'; | |||
| 255 | 1 | 3 | $parent->{parent} = $current; | |||
| 256 | 1 | 3 | $parent->{tokens} = []; | |||
| 257 | ||||||
| 258 | 1 1 | 2 3 | push @{$current->{tokens}}, $parent; | |||
| 259 | ||||||
| 260 | 1 | 6 | $current = $parent; | |||
| 261 | ||||||
| 262 | }elsif ($token->{type} eq 'group end'){ | |||||
| 263 | ||||||
| 264 | 1 | 2 | $current = $current->{parent}; | |||
| 265 | ||||||
| 266 | 1 | 4 | if (!defined($current)){ | |||
| 267 | 0 | 0 | $self->{error} = "end of group found without matching begin in rule $self->{rule}"; | |||
| 268 | 0 | 0 | return 0; | |||
| 269 | } | |||||
| 270 | ||||||
| 271 | }else{ | |||||
| 272 | 30 30 | 51 110 | push @{$current->{tokens}}, $token; | |||
| 273 | } | |||||
| 274 | ||||||
| 275 | } | |||||
| 276 | ||||||
| 277 | 20 | 63 | return 1; | |||
| 278 | } | |||||
| 279 | ||||||
| 280 | sub has_error { | |||||
| 281 | 21 | 52 | my ($self) = @_; | |||
| 282 | 21 | 119 | return $self->{error} ? 1 : 0; | |||
| 283 | } | |||||
| 284 | ||||||
| 285 | sub error { | |||||
| 286 | 0 | 0 | my ($self) = @_; | |||
| 287 | 0 | 0 | return $self->{error} ? $self->{error} : ''; | |||
| 288 | } | |||||
| 289 | ||||||
| 290 | sub base_token { | |||||
| 291 | 54 | 140 | my ($self) = @_; | |||
| 292 | 54 | 358 | return $self->{base}; | |||
| 293 | } | |||||
| 294 | ||||||
| 295 | 1; | |||||
| 296 | ||||||