File: | blib/lib/XML/Parser/LiteCopy.pm |
Coverage: | 93.6% |
line | stmt | bran | cond | sub | time | code |
---|---|---|---|---|---|---|
1 | # NOTE: This module originally came from SOAP::Lite, which you probably | |||||
2 | # don't have. It was first repackaged here just to avoid the huge | |||||
3 | # dependancy tree, but this version has several features (CDATA | |||||
4 | # support, better PI and Comment support) that have been added. | |||||
5 | ||||||
6 | # | |||||
7 | # Copyright (C) 2000-2007 Paul Kulchenko (paulclinger@yahoo.com) | |||||
8 | # Copyright (C) 2008 Martin Kutter (martin.kutter@fen-net.de) | |||||
9 | # Copyright (C) 2009 Cal Henderson (cal@iamcal.com) | |||||
10 | # | |||||
11 | # SOAP::Lite is free software; you can redistribute it | |||||
12 | # and/or modify it under the same terms as Perl itself. | |||||
13 | # | |||||
14 | ||||||
15 | package XML::Parser::LiteCopy; | |||||
16 | ||||||
17 | 9 9 9 | 36 16 42 | use strict; | |||
18 | 9 9 9 | 44 19 59 | use vars qw($VERSION); | |||
19 | $VERSION = '0.720.00'; | |||||
20 | ||||||
21 | my $ReturnErrors = 0; | |||||
22 | ||||||
23 | sub new { | |||||
24 | 48 | 995 | my $class = shift; | |||
25 | ||||||
26 | 48 | 156 | return $class if ref $class; | |||
27 | 48 | 162 | my $self = bless {} => $class; | |||
28 | ||||||
29 | 48 | 157 | my %parameters = @_; | |||
30 | 48 | 152 | $self->setHandlers(); # clear first | |||
31 | 48 48 | 95 309 | $self->setHandlers(%{$parameters{Handlers} || {}}); | |||
32 | ||||||
33 | 48 | 317 | $ReturnErrors = $parameters{ReturnErrors} || 0; | |||
34 | ||||||
35 | 48 | 170 | return $self; | |||
36 | } | |||||
37 | ||||||
38 | sub setHandlers { | |||||
39 | 106 | 284 | my $self = shift; | |||
40 | ||||||
41 | # allow symbolic refs, avoid "subroutine redefined" warnings | |||||
42 | 9 9 9 106 | 53 15 37 239 | no strict 'refs'; local $^W; | |||
43 | # clear all handlers if called without parameters | |||||
44 | 106 | 380 | if (not @_) { | |||
45 | 64 | 291 | for (qw(Start End Char Final Init CData Comment Doctype PI Error)) { | |||
46 | 163 | 340 | *$_ = sub {} | |||
47 | 640 | 2594 | } | |||
48 | } | |||||
49 | ||||||
50 | # we could use each here, too... | |||||
51 | 106 | 370 | while (@_) { | |||
52 | 112 | 401 | my($name, $func) = splice(@_, 0, 2); | |||
53 | *$name = defined $func | |||||
54 | ? $func | |||||
55 | 0 | 0 | : sub {} | |||
56 | 112 | 623 | } | |||
57 | 106 | 259 | return $self; | |||
58 | } | |||||
59 | ||||||
60 | sub _regexp { | |||||
61 | 18 | 126 | my $patch = shift || ''; | |||
62 | 18 | 41 | my $package = __PACKAGE__; | |||
63 | ||||||
64 | # This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html | |||||
65 | ||||||
66 | # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions", | |||||
67 | # Technical Report TR 1998-17, School of Computing Science, Simon Fraser University, November, 1998. | |||||
68 | # Copyright (c) 1998, Robert D. Cameron. | |||||
69 | # The following code may be freely used and distributed provided that | |||||
70 | # this copyright and citation notice remains intact and that modifications | |||||
71 | # or additions are clearly identified. | |||||
72 | ||||||
73 | 9 9 9 | 58 15 71 | use re 'eval'; | |||
74 | 18 | 49 | my $TextSE = "[^<]+"; | |||
75 | ||||||
76 | # the following backrefs have been added: | |||||
77 | # 1 : TextSE | |||||
78 | # 2 : MarkupSPE / DeclCE / CommentCE | |||||
79 | # 3 : MarkupSPE / DeclCE / CDATA_CE | |||||
80 | # 4 : MarkupSPE / DeclCE / DocTypeCE | |||||
81 | # 5 : MarkupSPE / PI_CE | |||||
82 | # 6 : MarkupSPE / EndTagCE | |||||
83 | # 7+: MarkupSPE / ElemTagCE | |||||
84 | ||||||
85 | 18 | 43 | my $Until2Hyphens = "(?:[^-]*)-(?:[^-]+-)*-"; | |||
86 | 18 | 85 | my $CommentCE = "($Until2Hyphens)(?{${package}::comment(\$2)})>?"; | |||
87 | ||||||
88 | 18 | 42 | my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+"; | |||
89 | 18 | 94 | my $CDATA_CE = "($UntilRSBs(?:[^\\]>]$UntilRSBs)*)(?{${package}::cdata(\$3)})>"; | |||
90 | ||||||
91 | 18 | 43 | my $S = "[ \\n\\t\\r]+"; | |||
92 | 18 | 42 | my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]"; | |||
93 | 18 | 40 | my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]"; | |||
94 | 18 | 79 | my $Name = "(?:$NameStrt)(?:$NameChar)*"; | |||
95 | 18 | 47 | my $QuoteSE = "\"[^\"]*\"|'[^']*'"; | |||
96 | 18 | 105 | my $DT_IdentSE = "$Name(?:$S(?:$Name|$QuoteSE))*"; | |||
97 | 18 | 57 | my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>"; | |||
98 | 18 | 43 | my $S1 = "[\\n\\r\\t ]"; | |||
99 | 18 | 40 | my $UntilQMs = "[^?]*\\?+"; | |||
100 | ||||||
101 | 18 | 86 | my $PI_Tail = "\\?|$S1$UntilQMs(?:[^>?]$UntilQMs)*"; | |||
102 | 18 | 140 | my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail>))|%$Name;|$S"; | |||
103 | 18 | 153 | my $DocTypeCE = "$S($DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?)>(?{${package}::_doctype(\$4)})"; | |||
104 | ||||||
105 | 18 | 101 | my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?"; | |||
106 | ||||||
107 | 18 | 101 | my $PI_CE = "($Name(?:$PI_Tail))>(?{${package}::_pi(\$5)})"; | |||
108 | ||||||
109 | # these expressions were modified for backtracking and events | |||||
110 | ||||||
111 | 18 | 92 | my $EndTagCE = "($Name)(?{${package}::_end(\$6)})(?:$S)?>"; | |||
112 | 18 | 38 | my $AttValSE = "\"([^<\"]*)\"|'([^<']*)'"; | |||
113 | ||||||
114 | 18 | 238 | my $ElemTagCE = "($Name)" | |||
115 | . "(?:$S($Name)(?:$S)?=(?:$S)?(?:$AttValSE)" | |||||
116 | . "(?{[\@{\$^R||[]},\$8=>defined\$9?\$9:\$10]}))*(?:$S)?(/)?>" | |||||
117 | . "(?{${package}::_start(\$7,\@{\$^R||[]}),\$^R=[]})(?{\$11 and ${package}::_end(\$7)})"; | |||||
118 | ||||||
119 | 18 | 136 | my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)"; | |||
120 | ||||||
121 | # Next expression is under "black magic". | |||||
122 | # Ideally it should be '($TextSE)(?{${package}::char(\$1)})|$MarkupSPE', | |||||
123 | # but it doesn't work under Perl 5.005 and only magic with | |||||
124 | # (?:....)?? solved the problem. | |||||
125 | # I would appreciate if someone let me know what is the right thing to do | |||||
126 | # and what's the reason for all this magic. | |||||
127 | # Seems like a problem related to (?:....)? rather than to ?{} feature. | |||||
128 | # Tests are in t/31-xmlparserlite.t if you decide to play with it. | |||||
129 | #"(?{[]})(?:($TextSE)(?{${package}::_char(\$1)}))$patch|$MarkupSPE"; | |||||
130 | 18 | 151 | "(?:($TextSE)(?{${package}::_char(\$1)}))$patch|$MarkupSPE"; | |||
131 | } | |||||
132 | ||||||
133 | setHandlers(); | |||||
134 | ||||||
135 | # Try 5.6 and 5.10 regex first | |||||
136 | my $REGEXP = _regexp('??'); | |||||
137 | ||||||
138 | sub _parse_re { | |||||
139 | 9 9 9 | 55 14 39 | use re "eval"; | |||
140 | undef $^R; | |||||
141 | 1 while $_[0] =~ m{$REGEXP}go | |||||
142 | }; | |||||
143 | ||||||
144 | # fixup regex if it does not work... | |||||
145 | { | |||||
146 | if (not eval { _parse_re('<soap:foo xmlns:soap="foo">bar</soap:foo>'); 1; } ) { | |||||
147 | $REGEXP = _regexp(); | |||||
148 | local $^W; | |||||
149 | *_parse_re = sub { | |||||
150 | 9 9 9 | 54 19 36 | use re "eval"; | |||
151 | 48 | 92 | undef $^R; | |||
152 | 48 | 74 | 1 while $_[0] =~ m{$REGEXP}go | |||
153 | }; | |||||
154 | } | |||||
155 | } | |||||
156 | ||||||
157 | sub parse { | |||||
158 | 48 | 428 | _init(); | |||
159 | 48 | 170 | _parse_re($_[1]); | |||
160 | 44 | 376 | _final(); | |||
161 | } | |||||
162 | ||||||
163 | my(@stack, $level); | |||||
164 | ||||||
165 | sub _init { | |||||
166 | 48 | 108 | @stack = (); | |||
167 | 48 | 99 | $level = 0; | |||
168 | 48 | 146 | Init(__PACKAGE__, @_); | |||
169 | } | |||||
170 | ||||||
171 | sub _final { | |||||
172 | 44 | 141 | return _error("not properly closed tag '$stack[-1]'") if @stack; | |||
173 | 42 | 122 | return _error("no element found") unless $level; | |||
174 | 40 | 116 | Final(__PACKAGE__, @_) | |||
175 | } | |||||
176 | ||||||
177 | sub _start { | |||||
178 | 71 | 378 | return _error("multiple roots, wrong element '$_[0]'") if $level++ && !@stack; | |||
179 | 69 | 218 | push(@stack, $_[0]); | |||
180 | 69 | 238 | Start(__PACKAGE__, @_); | |||
181 | } | |||||
182 | ||||||
183 | sub _char { | |||||
184 | 65 | 302 | Char(__PACKAGE__, $_[0]), return if @stack; | |||
185 | ||||||
186 | # check for junk before or after element | |||||
187 | # can't use split or regexp due to limitations in ?{} implementation, | |||||
188 | # will iterate with loop, but we'll do it no more than two times, so | |||||
189 | # it shouldn't affect performance | |||||
190 | for (my $i=0; $i < length $_[0]; $i++) { | |||||
191 | 33 13 | 334 101 | return _error("junk '$_[0]' @{[$level ? 'after' : 'before']} XML element") | |||
192 | if index("\n\r\t ", substr($_[0],$i,1)) < 0; # or should '< $[' be there | |||||
193 | 25 | 62 | } | |||
194 | } | |||||
195 | ||||||
196 | sub _end { | |||||
197 | 67 | 210 | return _error("unexpected closing tag '$_[0]'") if !@stack; | |||
198 | 66 | 277 | pop(@stack) eq $_[0] or return _error("mismatched tag '$_[0]'"); | |||
199 | 63 | 245 | End(__PACKAGE__, $_[0]); | |||
200 | } | |||||
201 | ||||||
202 | sub comment { | |||||
203 | 9 | 44 | Comment(__PACKAGE__, substr $_[0], 0, -2); | |||
204 | } | |||||
205 | ||||||
206 | sub end { | |||||
207 | 0 | 0 | pop(@stack) eq $_[0] or return _error("mismatched tag '$_[0]'"); | |||
208 | 0 | 0 | End(__PACKAGE__, $_[0]); | |||
209 | } | |||||
210 | ||||||
211 | sub cdata { | |||||
212 | 9 | 25 | return _error("CDATA outside of tag stack") unless @stack; | |||
213 | 9 | 47 | CData(__PACKAGE__, substr $_[0], 0, -2); | |||
214 | } | |||||
215 | ||||||
216 | sub _doctype { | |||||
217 | 1 | 4 | Doctype(__PACKAGE__, $_[0]); | |||
218 | } | |||||
219 | ||||||
220 | sub _pi { | |||||
221 | 6 | 32 | PI(__PACKAGE__, substr $_[0], 0, -1); | |||
222 | } | |||||
223 | ||||||
224 | sub _error { | |||||
225 | 23 | 71 | if ($ReturnErrors){ | |||
226 | 8 | 29 | Error(__PACKAGE__, $_[0]); | |||
227 | 8 | 51 | return; | |||
228 | } | |||||
229 | 15 | 38 | die "$_[0]\n"; | |||
230 | } | |||||
231 | ||||||
232 | # ====================================================================== | |||||
233 | 1; | |||||
234 |