| File: | blib/lib/XML/Parser/LiteCopy.pm |
| Coverage: | 93.6% |
| line | stmt | bran | cond | sub | time | code |
|---|---|---|---|---|---|---|
| 1 | # NOTE: This module originally came from SOAP::Lite, which you probably | |||||
| 2 | # don't have. It was first repackaged here just to avoid the huge | |||||
| 3 | # dependancy tree, but this version has several features (CDATA | |||||
| 4 | # support, better PI and Comment support) that have been added. | |||||
| 5 | ||||||
| 6 | # | |||||
| 7 | # Copyright (C) 2000-2007 Paul Kulchenko (paulclinger@yahoo.com) | |||||
| 8 | # Copyright (C) 2008 Martin Kutter (martin.kutter@fen-net.de) | |||||
| 9 | # Copyright (C) 2009 Cal Henderson (cal@iamcal.com) | |||||
| 10 | # | |||||
| 11 | # SOAP::Lite is free software; you can redistribute it | |||||
| 12 | # and/or modify it under the same terms as Perl itself. | |||||
| 13 | # | |||||
| 14 | ||||||
| 15 | package XML::Parser::LiteCopy; | |||||
| 16 | ||||||
| 17 | 9 9 9 | 36 16 42 | use strict; | |||
| 18 | 9 9 9 | 44 19 59 | use vars qw($VERSION); | |||
| 19 | $VERSION = '0.720.00'; | |||||
| 20 | ||||||
| 21 | my $ReturnErrors = 0; | |||||
| 22 | ||||||
| 23 | sub new { | |||||
| 24 | 48 | 995 | my $class = shift; | |||
| 25 | ||||||
| 26 | 48 | 156 | return $class if ref $class; | |||
| 27 | 48 | 162 | my $self = bless {} => $class; | |||
| 28 | ||||||
| 29 | 48 | 157 | my %parameters = @_; | |||
| 30 | 48 | 152 | $self->setHandlers(); # clear first | |||
| 31 | 48 48 | 95 309 | $self->setHandlers(%{$parameters{Handlers} || {}}); | |||
| 32 | ||||||
| 33 | 48 | 317 | $ReturnErrors = $parameters{ReturnErrors} || 0; | |||
| 34 | ||||||
| 35 | 48 | 170 | return $self; | |||
| 36 | } | |||||
| 37 | ||||||
| 38 | sub setHandlers { | |||||
| 39 | 106 | 284 | my $self = shift; | |||
| 40 | ||||||
| 41 | # allow symbolic refs, avoid "subroutine redefined" warnings | |||||
| 42 | 9 9 9 106 | 53 15 37 239 | no strict 'refs'; local $^W; | |||
| 43 | # clear all handlers if called without parameters | |||||
| 44 | 106 | 380 | if (not @_) { | |||
| 45 | 64 | 291 | for (qw(Start End Char Final Init CData Comment Doctype PI Error)) { | |||
| 46 | 163 | 340 | *$_ = sub {} | |||
| 47 | 640 | 2594 | } | |||
| 48 | } | |||||
| 49 | ||||||
| 50 | # we could use each here, too... | |||||
| 51 | 106 | 370 | while (@_) { | |||
| 52 | 112 | 401 | my($name, $func) = splice(@_, 0, 2); | |||
| 53 | *$name = defined $func | |||||
| 54 | ? $func | |||||
| 55 | 0 | 0 | : sub {} | |||
| 56 | 112 | 623 | } | |||
| 57 | 106 | 259 | return $self; | |||
| 58 | } | |||||
| 59 | ||||||
| 60 | sub _regexp { | |||||
| 61 | 18 | 126 | my $patch = shift || ''; | |||
| 62 | 18 | 41 | my $package = __PACKAGE__; | |||
| 63 | ||||||
| 64 | # This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html | |||||
| 65 | ||||||
| 66 | # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions", | |||||
| 67 | # Technical Report TR 1998-17, School of Computing Science, Simon Fraser University, November, 1998. | |||||
| 68 | # Copyright (c) 1998, Robert D. Cameron. | |||||
| 69 | # The following code may be freely used and distributed provided that | |||||
| 70 | # this copyright and citation notice remains intact and that modifications | |||||
| 71 | # or additions are clearly identified. | |||||
| 72 | ||||||
| 73 | 9 9 9 | 58 15 71 | use re 'eval'; | |||
| 74 | 18 | 49 | my $TextSE = "[^<]+"; | |||
| 75 | ||||||
| 76 | # the following backrefs have been added: | |||||
| 77 | # 1 : TextSE | |||||
| 78 | # 2 : MarkupSPE / DeclCE / CommentCE | |||||
| 79 | # 3 : MarkupSPE / DeclCE / CDATA_CE | |||||
| 80 | # 4 : MarkupSPE / DeclCE / DocTypeCE | |||||
| 81 | # 5 : MarkupSPE / PI_CE | |||||
| 82 | # 6 : MarkupSPE / EndTagCE | |||||
| 83 | # 7+: MarkupSPE / ElemTagCE | |||||
| 84 | ||||||
| 85 | 18 | 43 | my $Until2Hyphens = "(?:[^-]*)-(?:[^-]+-)*-"; | |||
| 86 | 18 | 85 | my $CommentCE = "($Until2Hyphens)(?{${package}::comment(\$2)})>?"; | |||
| 87 | ||||||
| 88 | 18 | 42 | my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+"; | |||
| 89 | 18 | 94 | my $CDATA_CE = "($UntilRSBs(?:[^\\]>]$UntilRSBs)*)(?{${package}::cdata(\$3)})>"; | |||
| 90 | ||||||
| 91 | 18 | 43 | my $S = "[ \\n\\t\\r]+"; | |||
| 92 | 18 | 42 | my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]"; | |||
| 93 | 18 | 40 | my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]"; | |||
| 94 | 18 | 79 | my $Name = "(?:$NameStrt)(?:$NameChar)*"; | |||
| 95 | 18 | 47 | my $QuoteSE = "\"[^\"]*\"|'[^']*'"; | |||
| 96 | 18 | 105 | my $DT_IdentSE = "$Name(?:$S(?:$Name|$QuoteSE))*"; | |||
| 97 | 18 | 57 | my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>"; | |||
| 98 | 18 | 43 | my $S1 = "[\\n\\r\\t ]"; | |||
| 99 | 18 | 40 | my $UntilQMs = "[^?]*\\?+"; | |||
| 100 | ||||||
| 101 | 18 | 86 | my $PI_Tail = "\\?|$S1$UntilQMs(?:[^>?]$UntilQMs)*"; | |||
| 102 | 18 | 140 | my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail>))|%$Name;|$S"; | |||
| 103 | 18 | 153 | my $DocTypeCE = "$S($DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?)>(?{${package}::_doctype(\$4)})"; | |||
| 104 | ||||||
| 105 | 18 | 101 | my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?"; | |||
| 106 | ||||||
| 107 | 18 | 101 | my $PI_CE = "($Name(?:$PI_Tail))>(?{${package}::_pi(\$5)})"; | |||
| 108 | ||||||
| 109 | # these expressions were modified for backtracking and events | |||||
| 110 | ||||||
| 111 | 18 | 92 | my $EndTagCE = "($Name)(?{${package}::_end(\$6)})(?:$S)?>"; | |||
| 112 | 18 | 38 | my $AttValSE = "\"([^<\"]*)\"|'([^<']*)'"; | |||
| 113 | ||||||
| 114 | 18 | 238 | my $ElemTagCE = "($Name)" | |||
| 115 | . "(?:$S($Name)(?:$S)?=(?:$S)?(?:$AttValSE)" | |||||
| 116 | . "(?{[\@{\$^R||[]},\$8=>defined\$9?\$9:\$10]}))*(?:$S)?(/)?>" | |||||
| 117 | . "(?{${package}::_start(\$7,\@{\$^R||[]}),\$^R=[]})(?{\$11 and ${package}::_end(\$7)})"; | |||||
| 118 | ||||||
| 119 | 18 | 136 | my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)"; | |||
| 120 | ||||||
| 121 | # Next expression is under "black magic". | |||||
| 122 | # Ideally it should be '($TextSE)(?{${package}::char(\$1)})|$MarkupSPE', | |||||
| 123 | # but it doesn't work under Perl 5.005 and only magic with | |||||
| 124 | # (?:....)?? solved the problem. | |||||
| 125 | # I would appreciate if someone let me know what is the right thing to do | |||||
| 126 | # and what's the reason for all this magic. | |||||
| 127 | # Seems like a problem related to (?:....)? rather than to ?{} feature. | |||||
| 128 | # Tests are in t/31-xmlparserlite.t if you decide to play with it. | |||||
| 129 | #"(?{[]})(?:($TextSE)(?{${package}::_char(\$1)}))$patch|$MarkupSPE"; | |||||
| 130 | 18 | 151 | "(?:($TextSE)(?{${package}::_char(\$1)}))$patch|$MarkupSPE"; | |||
| 131 | } | |||||
| 132 | ||||||
| 133 | setHandlers(); | |||||
| 134 | ||||||
| 135 | # Try 5.6 and 5.10 regex first | |||||
| 136 | my $REGEXP = _regexp('??'); | |||||
| 137 | ||||||
| 138 | sub _parse_re { | |||||
| 139 | 9 9 9 | 55 14 39 | use re "eval"; | |||
| 140 | undef $^R; | |||||
| 141 | 1 while $_[0] =~ m{$REGEXP}go | |||||
| 142 | }; | |||||
| 143 | ||||||
| 144 | # fixup regex if it does not work... | |||||
| 145 | { | |||||
| 146 | if (not eval { _parse_re('<soap:foo xmlns:soap="foo">bar</soap:foo>'); 1; } ) { | |||||
| 147 | $REGEXP = _regexp(); | |||||
| 148 | local $^W; | |||||
| 149 | *_parse_re = sub { | |||||
| 150 | 9 9 9 | 54 19 36 | use re "eval"; | |||
| 151 | 48 | 92 | undef $^R; | |||
| 152 | 48 | 74 | 1 while $_[0] =~ m{$REGEXP}go | |||
| 153 | }; | |||||
| 154 | } | |||||
| 155 | } | |||||
| 156 | ||||||
| 157 | sub parse { | |||||
| 158 | 48 | 428 | _init(); | |||
| 159 | 48 | 170 | _parse_re($_[1]); | |||
| 160 | 44 | 376 | _final(); | |||
| 161 | } | |||||
| 162 | ||||||
| 163 | my(@stack, $level); | |||||
| 164 | ||||||
| 165 | sub _init { | |||||
| 166 | 48 | 108 | @stack = (); | |||
| 167 | 48 | 99 | $level = 0; | |||
| 168 | 48 | 146 | Init(__PACKAGE__, @_); | |||
| 169 | } | |||||
| 170 | ||||||
| 171 | sub _final { | |||||
| 172 | 44 | 141 | return _error("not properly closed tag '$stack[-1]'") if @stack; | |||
| 173 | 42 | 122 | return _error("no element found") unless $level; | |||
| 174 | 40 | 116 | Final(__PACKAGE__, @_) | |||
| 175 | } | |||||
| 176 | ||||||
| 177 | sub _start { | |||||
| 178 | 71 | 378 | return _error("multiple roots, wrong element '$_[0]'") if $level++ && !@stack; | |||
| 179 | 69 | 218 | push(@stack, $_[0]); | |||
| 180 | 69 | 238 | Start(__PACKAGE__, @_); | |||
| 181 | } | |||||
| 182 | ||||||
| 183 | sub _char { | |||||
| 184 | 65 | 302 | Char(__PACKAGE__, $_[0]), return if @stack; | |||
| 185 | ||||||
| 186 | # check for junk before or after element | |||||
| 187 | # can't use split or regexp due to limitations in ?{} implementation, | |||||
| 188 | # will iterate with loop, but we'll do it no more than two times, so | |||||
| 189 | # it shouldn't affect performance | |||||
| 190 | for (my $i=0; $i < length $_[0]; $i++) { | |||||
| 191 | 33 13 | 334 101 | return _error("junk '$_[0]' @{[$level ? 'after' : 'before']} XML element") | |||
| 192 | if index("\n\r\t ", substr($_[0],$i,1)) < 0; # or should '< $[' be there | |||||
| 193 | 25 | 62 | } | |||
| 194 | } | |||||
| 195 | ||||||
| 196 | sub _end { | |||||
| 197 | 67 | 210 | return _error("unexpected closing tag '$_[0]'") if !@stack; | |||
| 198 | 66 | 277 | pop(@stack) eq $_[0] or return _error("mismatched tag '$_[0]'"); | |||
| 199 | 63 | 245 | End(__PACKAGE__, $_[0]); | |||
| 200 | } | |||||
| 201 | ||||||
| 202 | sub comment { | |||||
| 203 | 9 | 44 | Comment(__PACKAGE__, substr $_[0], 0, -2); | |||
| 204 | } | |||||
| 205 | ||||||
| 206 | sub end { | |||||
| 207 | 0 | 0 | pop(@stack) eq $_[0] or return _error("mismatched tag '$_[0]'"); | |||
| 208 | 0 | 0 | End(__PACKAGE__, $_[0]); | |||
| 209 | } | |||||
| 210 | ||||||
| 211 | sub cdata { | |||||
| 212 | 9 | 25 | return _error("CDATA outside of tag stack") unless @stack; | |||
| 213 | 9 | 47 | CData(__PACKAGE__, substr $_[0], 0, -2); | |||
| 214 | } | |||||
| 215 | ||||||
| 216 | sub _doctype { | |||||
| 217 | 1 | 4 | Doctype(__PACKAGE__, $_[0]); | |||
| 218 | } | |||||
| 219 | ||||||
| 220 | sub _pi { | |||||
| 221 | 6 | 32 | PI(__PACKAGE__, substr $_[0], 0, -1); | |||
| 222 | } | |||||
| 223 | ||||||
| 224 | sub _error { | |||||
| 225 | 23 | 71 | if ($ReturnErrors){ | |||
| 226 | 8 | 29 | Error(__PACKAGE__, $_[0]); | |||
| 227 | 8 | 51 | return; | |||
| 228 | } | |||||
| 229 | 15 | 38 | die "$_[0]\n"; | |||
| 230 | } | |||||
| 231 | ||||||
| 232 | # ====================================================================== | |||||
| 233 | 1; | |||||
| 234 | ||||||