File: | lib/Parse/EBNF/Rule.pm |
Coverage: | 81.6% |
line | stmt | bran | cond | sub | time | code |
---|---|---|---|---|---|---|
1 | package Parse::EBNF::Rule; | |||||
2 | ||||||
3 | 2 2 2 | 1761 6 22 | use Parse::EBNF::Token; | |||
4 | ||||||
5 | sub new { | |||||
6 | 2 | 9 | my ($class, $rule) = @_; | |||
7 | 2 | 10 | my $self = bless {}, $class; | |||
8 | 2 | 8 | $self->{error} = 0; | |||
9 | ||||||
10 | 2 | 11 | $self->parse($rule) if defined $rule; | |||
11 | ||||||
12 | 2 | 8 | return $self; | |||
13 | } | |||||
14 | ||||||
15 | sub parse { | |||||
16 | 22 | 72 | my ($self, $rule) = @_; | |||
17 | ||||||
18 | 22 | 58 | $self->{error} = 0; | |||
19 | ||||||
20 | # strip comments | |||||
21 | 22 | 59 | $rule =~ s!/\*([^\*]|\*[^\/])*\*\/!!g; | |||
22 | ||||||
23 | 22 | 110 | unless ($rule =~ m!^\s*\[(\d+)\]\s*([A-Z][a-zA-Z]*)\s*\:\:=!){ | |||
24 | ||||||
25 | 2 | 13 | $self->{error} = "can't parse rule $rule"; | |||
26 | 2 | 5 | return; | |||
27 | } | |||||
28 | ||||||
29 | 20 | 73 | $self->{index} = $1; | |||
30 | 20 | 55 | $self->{name} = $2; | |||
31 | ||||||
32 | 20 | 60 | $rule =~ s!^(.*?)\:\:=!!; | |||
33 | ||||||
34 | 20 | 54 | $self->{rule} = $rule; | |||
35 | ||||||
36 | ||||||
37 | # now try and tokenise the rule | |||||
38 | # we first tokenise it, and *then* split it into alternations, | |||||
39 | # since finding the pipes will be tricky if they occur inside | |||||
40 | # literals or character classes | |||||
41 | ||||||
42 | 20 | 45 | my $tokens = []; | |||
43 | ||||||
44 | 20 | 46 | $rule =~ s/^\s+//; | |||
45 | ||||||
46 | 20 | 56 | while($rule){ | |||
47 | 32 | 61 | my $token = undef; | |||
48 | ||||||
49 | 32 | 281 | if ($rule =~ m!^'([^']+)'!){ | |||
50 | ||||||
51 | 3 | 11 | $token = Parse::EBNF::Token->new(); | |||
52 | 3 | 10 | $token->{content} = $1; | |||
53 | 3 | 9 | $token->{type} = 'literal'; | |||
54 | 3 | 14 | $rule = substr $rule, 2 + length $1; | |||
55 | ||||||
56 | }elsif ($rule =~ m!^"([^"]+)"!){ | |||||
57 | ||||||
58 | 1 | 3 | $token = Parse::EBNF::Token->new(); | |||
59 | 1 | 4 | $token->{content} = $1; | |||
60 | 1 | 3 | $token->{type} = 'literal'; | |||
61 | 1 | 5 | $rule = substr $rule, 2 + length $1; | |||
62 | ||||||
63 | }elsif ($rule =~ m!^\|!){ | |||||
64 | ||||||
65 | 2 | 7 | $token = Parse::EBNF::Token->new(); | |||
66 | 2 | 6 | $token->{type} = 'alt'; | |||
67 | 2 | 6 | $rule = substr $rule, 1; | |||
68 | ||||||
69 | }elsif ($rule =~ m!^([A-Z][a-zA-Z]*)!){ | |||||
70 | ||||||
71 | 12 | 41 | $token = Parse::EBNF::Token->new(); | |||
72 | 12 | 42 | $token->{content} = $1; | |||
73 | 12 | 36 | $token->{type} = 'subrule'; | |||
74 | 12 | 35 | $rule = substr $rule, length $1; | |||
75 | ||||||
76 | }elsif ($rule =~ m!^\[(\^?)(([^\]]|\\\])+)\]!){ | |||||
77 | ||||||
78 | # some sort of class - sub-parse it | |||||
79 | ||||||
80 | 8 | 21 | my $neg = $1; | |||
81 | 8 | 16 | my $inner = $2; | |||
82 | ||||||
83 | 8 | 30 | $rule = substr $rule, 2 + length($neg) + length($inner); | |||
84 | ||||||
85 | 8 | 18 | my $rx = '['.$neg; | |||
86 | 8 | 25 | while(length $inner){ | |||
87 | ||||||
88 | 18 | 93 | if ($inner =~ m!^#x([0-9a-f]+)-#x([0-9a-f]+)!i){ | |||
89 | ||||||
90 | 2 | 9 | $inner = substr $inner, 5 + length($1) + length($2); | |||
91 | 2 | 6 | $rx .= $self->hexchar($1).'-'.$self->hexchar($2); | |||
92 | ||||||
93 | }elsif ($inner =~ m!^#x([0-9a-f]+)!i){ | |||||
94 | ||||||
95 | 6 | 19 | $inner = substr $inner, 2 + length($1); | |||
96 | 6 | 19 | $rx .= $self->hexchar($1); | |||
97 | ||||||
98 | }elsif ($inner =~ m!^([^-])-([^-])!i){ | |||||
99 | ||||||
100 | 4 | 10 | $inner = substr $inner, 3; | |||
101 | 4 | 24 | $rx .= quotemeta($1).'-'.quotemeta($2); | |||
102 | ||||||
103 | }elsif ($inner =~ m!^([^-])!i){ | |||||
104 | ||||||
105 | 6 | 15 | $inner = substr $inner, 1; | |||
106 | 6 | 25 | $rx .= quotemeta($1); | |||
107 | ||||||
108 | }else{ | |||||
109 | ||||||
110 | 0 | 0 | $self->{error} = "couldn't parse class rx at $inner"; | |||
111 | 0 | 0 | exit; | |||
112 | } | |||||
113 | } | |||||
114 | 8 | 17 | $rx .= ']'; | |||
115 | ||||||
116 | 8 | 28 | $token = Parse::EBNF::Token->new(); | |||
117 | 8 | 22 | $token->{content} = $rx; | |||
118 | 8 | 27 | $token->{type} = 'rx'; | |||
119 | ||||||
120 | ||||||
121 | }elsif ($rule =~ m!^\[(([^\]]|\\\])+)\]!){ | |||||
122 | ||||||
123 | 0 | 0 | $token = Parse::EBNF::Token->new(); | |||
124 | 0 | 0 | $token->{content} = $1; | |||
125 | 0 | 0 | $token->{type} = 'class'; | |||
126 | 0 | 0 | $rule = substr $rule, 2 + length $1; | |||
127 | ||||||
128 | }elsif ($rule =~ m!^\*!){ | |||||
129 | ||||||
130 | 1 | 5 | $token = Parse::EBNF::Token->new(); | |||
131 | 1 | 4 | $token->{type} = 'rep star'; | |||
132 | 1 | 3 | $rule = substr $rule, 1; | |||
133 | ||||||
134 | }elsif ($rule =~ m!^\+!){ | |||||
135 | ||||||
136 | 1 | 3 | $token = Parse::EBNF::Token->new(); | |||
137 | 1 | 3 | $token->{type} = 'rep plus'; | |||
138 | 1 | 4 | $rule = substr $rule, 1; | |||
139 | ||||||
140 | }elsif ($rule =~ m!^\?!){ | |||||
141 | ||||||
142 | 1 | 4 | $token = Parse::EBNF::Token->new(); | |||
143 | 1 | 3 | $token->{type} = 'rep quest'; | |||
144 | 1 | 4 | $rule = substr $rule, 1; | |||
145 | ||||||
146 | }elsif ($rule =~ m!^\(!){ | |||||
147 | ||||||
148 | 1 | 4 | $token = Parse::EBNF::Token->new(); | |||
149 | 1 | 4 | $token->{type} = 'group start'; | |||
150 | 1 | 3 | $rule = substr $rule, 1; | |||
151 | ||||||
152 | }elsif ($rule =~ m!^\)!){ | |||||
153 | ||||||
154 | 1 | 3 | $token = Parse::EBNF::Token->new(); | |||
155 | 1 | 3 | $token->{type} = 'group end'; | |||
156 | 1 | 3 | $rule = substr $rule, 1; | |||
157 | ||||||
158 | ||||||
159 | }elsif ($rule =~ m!^\-!){ | |||||
160 | ||||||
161 | 0 | 0 | $token = Parse::EBNF::Token->new(); | |||
162 | 0 | 0 | $token->{type} = 'dash'; | |||
163 | 0 | 0 | $rule = substr $rule, 1; | |||
164 | ||||||
165 | }elsif ($rule =~ m!^#x([0-9a-f]+)!i){ | |||||
166 | ||||||
167 | 1 | 4 | $token = Parse::EBNF::Token->new(); | |||
168 | 1 | 4 | $token->{content} = $self->hexchar($1); | |||
169 | 1 | 5 | $token->{type} = 'rx'; | |||
170 | 1 | 5 | $rule = substr $rule, 2 + length $1; | |||
171 | ||||||
172 | }else{ | |||||
173 | ||||||
174 | 0 | 0 | $self->{error} = "couldn't parse token at start of $rule"; | |||
175 | 0 | 0 | return; | |||
176 | } | |||||
177 | ||||||
178 | 32 32 | 52 83 | push @{$tokens}, $token; | |||
179 | ||||||
180 | 32 | 128 | $rule =~ s/^\s+//; | |||
181 | } | |||||
182 | ||||||
183 | # | |||||
184 | # first we create a base token (of type list) | |||||
185 | # which will represent a list of tokens for this rule | |||||
186 | # | |||||
187 | ||||||
188 | 20 | 63 | my $base = Parse::EBNF::Token->new(); | |||
189 | 20 | 67 | $base->{type} = 'list'; | |||
190 | 20 | 52 | $base->{tokens} = $tokens; | |||
191 | 20 | 50 | $self->{base} = $base; | |||
192 | ||||||
193 | ||||||
194 | # | |||||
195 | # now we create a node tree from the flat list | |||||
196 | # | |||||
197 | ||||||
198 | 20 | 81 | return unless $self->produce_groups($base); | |||
199 | ||||||
200 | ||||||
201 | # | |||||
202 | # and perform recursive cleanups | |||||
203 | # | |||||
204 | ||||||
205 | 20 | 61 | unless ($base->reduce_alternations()){ | |||
206 | 0 | 0 | $self->{error} = $base->{error}; | |||
207 | 0 | 0 | return; | |||
208 | } | |||||
209 | ||||||
210 | 20 | 61 | unless ($base->reduce_repetition()){ | |||
211 | 0 | 0 | $self->{error} = $base->{error}; | |||
212 | 0 | 0 | return; | |||
213 | } | |||||
214 | ||||||
215 | # TODO: negations | |||||
216 | ||||||
217 | 20 | 61 | unless ($base->reduce_empty()){ | |||
218 | 0 | 0 | $self->{error} = $base->{error}; | |||
219 | 0 | 0 | return; | |||
220 | } | |||||
221 | ||||||
222 | 20 | 60 | unless ($base->reduce_rx()){ | |||
223 | 0 | 0 | $self->{error} = $base->{error}; | |||
224 | 0 | 0 | return; | |||
225 | } | |||||
226 | } | |||||
227 | ||||||
228 | sub hexchar { | |||||
229 | 11 | 32 | my ($self, $char) = @_; | |||
230 | ||||||
231 | 11 | 20 | $char =~ s!^0+!!; | |||
232 | ||||||
233 | 11 | 39 | if (hex($char) > 255){ | |||
234 | ||||||
235 | 0 | 0 | return '\\x{'.lc($char).'}'; | |||
236 | }else{ | |||||
237 | ||||||
238 | 11 | 60 | return '\\x'.lc($char); | |||
239 | } | |||||
240 | } | |||||
241 | ||||||
242 | sub produce_groups { | |||||
243 | 20 | 51 | my ($self, $base) = @_; | |||
244 | ||||||
245 | 20 | 52 | my $tokens = $base->{tokens}; | |||
246 | 20 | 55 | $base->{tokens} = []; | |||
247 | 20 | 36 | my $current = $base; | |||
248 | ||||||
249 | 20 52 | 33 196 | while(my $token = shift @{$tokens}){ | |||
250 | ||||||
251 | 32 | 171 | if ($token->{type} eq 'group start'){ | |||
252 | ||||||
253 | 1 | 3 | my $parent = Parse::EBNF::Token->new(); | |||
254 | 1 | 4 | $parent->{type} = 'list'; | |||
255 | 1 | 3 | $parent->{parent} = $current; | |||
256 | 1 | 3 | $parent->{tokens} = []; | |||
257 | ||||||
258 | 1 1 | 2 3 | push @{$current->{tokens}}, $parent; | |||
259 | ||||||
260 | 1 | 6 | $current = $parent; | |||
261 | ||||||
262 | }elsif ($token->{type} eq 'group end'){ | |||||
263 | ||||||
264 | 1 | 2 | $current = $current->{parent}; | |||
265 | ||||||
266 | 1 | 4 | if (!defined($current)){ | |||
267 | 0 | 0 | $self->{error} = "end of group found without matching begin in rule $self->{rule}"; | |||
268 | 0 | 0 | return 0; | |||
269 | } | |||||
270 | ||||||
271 | }else{ | |||||
272 | 30 30 | 51 110 | push @{$current->{tokens}}, $token; | |||
273 | } | |||||
274 | ||||||
275 | } | |||||
276 | ||||||
277 | 20 | 63 | return 1; | |||
278 | } | |||||
279 | ||||||
280 | sub has_error { | |||||
281 | 21 | 52 | my ($self) = @_; | |||
282 | 21 | 119 | return $self->{error} ? 1 : 0; | |||
283 | } | |||||
284 | ||||||
285 | sub error { | |||||
286 | 0 | 0 | my ($self) = @_; | |||
287 | 0 | 0 | return $self->{error} ? $self->{error} : ''; | |||
288 | } | |||||
289 | ||||||
290 | sub base_token { | |||||
291 | 54 | 140 | my ($self) = @_; | |||
292 | 54 | 358 | return $self->{base}; | |||
293 | } | |||||
294 | ||||||
295 | 1; | |||||
296 |