1 package CharClass::Matcher;
4 use warnings FATAL => 'all';
5 use Text::Wrap qw(wrap);
8 $Data::Dumper::Useqq= 1;
9 our $hex_fmt= "0x%02X";
13 CharClass::Matcher -- Generate C macros that match character classes efficiently
17 perl Porting/regcharclass.pl
21 Dynamically generates macros for detecting special charclasses
22 in latin-1, utf8, and codepoint forms. Macros can be set to return
23 the length (in bytes) of the matched codepoint, or the codepoint itself.
25 To regenerate regcharclass.h, run this script from perl-root. No arguments
28 Using WHATEVER as an example the following macros will be produced:
32 =item is_WHATEVER(s,is_utf8)
34 =item is_WHATEVER_safe(s,e,is_utf8)
36 Do a lookup as appropriate based on the is_utf8 flag. When possible
37 comparisons involving octect<128 are done before checking the is_utf8
38 flag, hopefully saving time.
40 =item is_WHATEVER_utf8(s)
42 =item is_WHATEVER_utf8_safe(s,e)
44 Do a lookup assuming the string is encoded in (normalized) UTF8.
46 =item is_WHATEVER_latin1(s)
48 =item is_WHATEVER_latin1_safe(s,e)
50 Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
52 =item is_WHATEVER_cp(cp)
54 Check to see if the string matches a given codepoint (hypotethically a
55 U32). The condition is constructed as as to "break out" as early as
56 possible if the codepoint is out of range of the condition.
60 (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
62 Thus if the character is X+1 only two comparisons will be done. Making
63 matching lookups slower, but non-matching faster.
67 Additionally it is possible to generate C<what_> variants that return
68 the codepoint read instead of the number of octets read, this can be
69 done by suffixing '-cp' to the type description.
73 perltidy -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
78 Author: Yves Orton (demerphq) 2007
82 No tests directly here (although the regex engine will fail tests
83 if this code is broken). Insufficient documentation and no Getopts
84 handler for using the module as a script.
88 You may distribute under the terms of either the GNU General Public
89 License or the Artistic License, as specified in the README file.
93 # Sub naming convention:
94 # __func : private subroutine, can not be called as a method
95 # _func : private method, not meant for external use
96 # func : public method.
99 #-------------------------------------------------------------------------------
101 # ($cp,$n,$l,$u)=__uni_latin($str);
103 # Return a list of arrays, each of which when interepreted correctly
104 # represent the string in some given encoding with specific conditions.
106 # $cp - list of codepoints that make up the string.
107 # $n - list of octets that make up the string if all codepoints < 128
108 # $l - list of octets that make up the string in latin1 encoding if all
109 # codepoints < 256, and at least one codepoint is >127.
110 # $u - list of octets that make up the string in utf8 if any codepoint >127
113 #-----------+----------
123 for my $ch ( split //, $str ) {
126 $max= $cp if $max < $cp;
132 $l= [@cp] if $max && $max < 256;
134 my $copy= $str; # must copy string, FB_CROAK makes encode destructive
135 $u= eval { Encode::encode( "utf8", $copy, Encode::FB_CROAK ) };
136 # $u is utf8 but with the utf8 flag OFF
137 # therefore "C*" gets us the values of the bytes involved.
138 $u= [ unpack "C*", $u ] if defined $u;
140 return ( \@cp, $n, $l, $u );
144 # $clean= __clean($expr);
146 # Cleanup a ternary expression, removing unnecessary parens and apply some
147 # simplifications using regexes.
153 $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
155 #print "$parens\n$expr\n";
156 1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
157 1 while $expr =~ s/ \( \s* ($parens) \s* \? \s*
158 \( \s* ($parens) \s* \? \s* ($parens|[^:]+?) \s* : \s* ($parens|[^)]+?) \s* \)
159 \s* : \s* \4 \s* \)/( ( $1 && $2 ) ? $3 : 0 )/gx;
164 # $text= __macro(@args);
165 # Join args together by newlines, and then neatly add backslashes to the end
166 # of every line as expected by the C pre-processor for #define's.
170 my $str= join "\n", @_;
172 my @lines= map { s/\s+$//; s/\t/ /g; $_ } split /\n/, $str;
173 my $last= pop @lines;
174 $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
175 1 while $str =~ s/^(\t*) {8}/$1\t/gm;
180 # my $op=__incrdepth($op);
182 # take an 'op' hashref and add one to it and all its childrens depths.
187 return unless ref $op;
189 __incrdepth( $op->{yes} );
190 __incrdepth( $op->{no} );
194 # join two branches of an opcode together with a condition, incrementing
195 # the depth on the yes branch when we do so.
196 # returns the new root opcode of the tree.
198 my ( $cond, $yes, $no )= @_;
201 yes => __incrdepth( $yes ),
211 # my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
213 # Create a new CharClass::Matcher object by parsing the text in
214 # the txt array. Currently applies the following rules:
216 # Element starts with C<0x>, line is evaled the result treated as
217 # a number which is passed to chr().
219 # Element starts with C<">, line is evaled and the result treated
222 # Each string is then stored in the 'strs' subhash as a hash record
223 # made up of the results of __uni_latin1, using the keynames
224 # 'low','latin1','utf8', as well as the synthesized 'LATIN1' and
225 # 'UTF8' which hold a merge of 'low' and their lowercase equivelents.
227 # Size data is tracked per type in the 'size' subhash.
235 die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
241 title => $opt{title} || '',
243 foreach my $txt ( @{ $opt{txt} } ) {
245 if ( $str =~ /^[""]/ ) {
247 } elsif ( $str =~ /^0x/ ) {
250 die "Unparseable line: $txt\n";
254 my ( $cp, $low, $latin1, $utf8 )= __uni_latin1( $str );
255 my $UTF8= $low || $utf8;
256 my $LATIN1= $low || $latin1;
257 #die Dumper($txt,$cp,$low,$latin1,$utf8)
258 # if $txt=~/NEL/ or $utf8 and @$utf8>3;
260 @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 cp UTF8 LATIN1 )}=
261 ( $str, $txt, $low, $utf8, $latin1, $cp, $UTF8, $LATIN1 );
262 my $rec= $self->{strs}{$str};
263 foreach my $key ( qw(low utf8 latin1 cp UTF8 LATIN1) ) {
264 $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
265 if $self->{strs}{$str}{$key};
267 $self->{has_multi} ||= @$cp > 1;
268 $self->{has_ascii} ||= $latin1 && @$latin1;
269 $self->{has_low} ||= $low && @$low;
270 $self->{has_high} ||= !$low && !$latin1;
272 $self->{val_fmt}= $hex_fmt;
273 $self->{count}= 0 + keys %{ $self->{strs} };
277 # my $trie = make_trie($type,$maxlen);
279 # using the data stored in the object build a trie of a specifc type,
280 # and with specific maximum depth. The trie is made up the elements of
281 # the given types array for each string in the object (assuming it is
284 # returns the trie, or undef if there was no relevent data in the object.
288 my ( $self, $type, $maxlen )= @_;
290 my $strs= $self->{strs};
292 foreach my $rec ( values %$strs ) {
293 die "panic: unknown type '$type'"
294 if !exists $rec->{$type};
295 my $dat= $rec->{$type};
297 next if $maxlen && @$dat > $maxlen;
299 foreach my $elem ( @$dat ) {
300 $node->{$elem} ||= {};
301 $node= $node->{$elem};
303 $node->{''}= $rec->{str};
305 return 0 + keys( %trie ) ? \%trie : undef;
308 # my $optree= _optree()
310 # recursively convert a trie to an optree where every node represents
316 my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
317 return unless defined $trie;
318 if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
319 die "Can't do 'cp' optree from multi-codepoint strings";
322 $else= 0 unless defined $else;
323 $depth= 0 unless defined $depth;
325 my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
327 if ( $ret_type eq 'cp' ) {
328 $else= $self->{strs}{ $trie->{''} }{cp}[0];
329 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
330 } elsif ( $ret_type eq 'len' ) {
332 } elsif ( $ret_type eq 'both') {
333 $else= $self->{strs}{ $trie->{''} }{cp}[0];
334 $else= sprintf "$self->{val_fmt}", $else if $else > 9;
335 $else= "len=$depth, $else";
338 return $else if !@conds;
341 my ( $yes_res, $as_code, @cond );
342 my $test= $test_type eq 'cp' ? "cp" : "((U8*)s)[$depth]";
344 $node->{vals}= [@cond];
345 $node->{test}= $test;
346 $node->{yes}= $yes_res;
347 $node->{depth}= $depth;
351 my $cond= shift @conds;
353 $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else,
355 my $res_code= Dumper( $res );
356 if ( !$yes_res || $res_code ne $as_code ) {
361 ( $yes_res, $as_code )= ( $res, $res_code );
371 # my $optree= optree(%opts);
373 # Convert a trie to an optree, wrapper for _optree
378 my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
379 $opt{ret_type} ||= 'len';
380 my $test_type= $opt{type} eq 'cp' ? 'cp' : 'depth';
381 return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
384 # my $optree= generic_optree(%opts);
386 # build a "generic" optree out of the three 'low', 'latin1', 'utf8'
387 # sets of strings, including a branch for handling the string type check.
394 $opt{ret_type} ||= 'len';
395 my $test_type= 'depth';
396 my $else= $opt{else} || 0;
398 my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
399 my $utf8= $self->make_trie( 'utf8', $opt{max_depth} );
401 $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
405 $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
406 } elsif ( $latin1 ) {
407 $else= __cond_join( "!( is_utf8 )", $latin1, $else );
409 my $low= $self->make_trie( 'low', $opt{max_depth} );
411 $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
419 # create a string length guarded optree.
425 my $type= $opt{type};
427 die "Can't do a length_optree on type 'cp', makes no sense."
430 my ( @size, $method );
432 if ( $type eq 'generic' ) {
433 $method= 'generic_optree';
435 %{ $self->{size}{low} || {} },
436 %{ $self->{size}{latin1} || {} },
437 %{ $self->{size}{utf8} || {} }
439 @size= sort { $a <=> $b } keys %sizes;
442 @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
445 my $else= ( $opt{else} ||= 0 );
446 for my $size ( @size ) {
447 my $optree= $self->$method( %opt, type => $type, max_depth => $size );
448 my $cond= "((e)-(s) > " . ( $size - 1 ).")";
449 $else= __cond_join( $cond, $optree, $else );
455 # turn a list of conditions into a text expression
456 # - merges ranges of conditions, and joins the result with ||
458 my ( $self, $op, $combine )= @_;
459 my $cond= $op->{vals};
460 my $test= $op->{test};
461 return "( $test )" if !defined $cond;
467 if ( $ranges[-1][0] == $ranges[-1][1] ) {
468 $ranges[-1]= $ranges[-1][0];
469 } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
470 $ranges[-1]= $ranges[-1][0];
471 push @ranges, $ranges[-1] + 1;
475 for my $cond ( @$cond ) {
476 if ( !@ranges || $cond != $ranges[-1][1] + 1 ) {
478 push @ranges, [ $cond, $cond ];
484 return $self->_combine( $test, @ranges )
489 "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
491 : sprintf( "$self->{val_fmt} == $test", $_ );
493 return "( " . join( " || ", @ranges ) . " )";
497 # recursively turn a list of conditions into a fast break-out condition
498 # used by _cond_as_str() for 'cp' type macros.
500 my ( $self, $test, @cond )= @_;
502 my $item= shift @cond;
506 sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
508 $gtv= sprintf "$self->{val_fmt}", $item->[1];
510 $cstr= sprintf( "$self->{val_fmt} == $test", $item );
511 $gtv= sprintf "$self->{val_fmt}", $item;
514 return "( $cstr || ( $gtv < $test &&\n"
515 . $self->_combine( $test, @cond ) . " ) )";
522 # recursively convert an optree to text with reasonably neat formatting
524 my ( $self, $op, $combine, $brace )= @_;
528 my $cond= $self->_cond_as_str( $op, $combine );
529 my $yes= $self->_render( $op->{yes}, $combine, 1 );
530 my $no= $self->_render( $op->{no}, $combine, 0 );
531 return "( $cond )" if $yes eq '1' and $no eq '0';
532 my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
533 return "$lb$cond ? $yes : $no$rb"
534 if !ref( $op->{yes} ) && !ref( $op->{no} );
536 my $ind= "\n" . ( $ind1 x $op->{depth} );
538 if ( ref $op->{yes} ) {
539 $yes= $ind . $ind1 . $yes;
544 return "$lb$cond ?$yes$ind: $no$rb";
547 # $expr=render($op,$combine)
549 # convert an optree to text with reasonably neat formatting. If $combine
550 # is true then the condition is created using "fast breakouts" which
551 # produce uglier expressions that are more efficient for common case,
552 # longer lists such as that resulting from type 'cp' output.
553 # Currently only used for type 'cp' macros.
555 my ( $self, $op, $combine )= @_;
556 my $str= "( " . $self->_render( $op, $combine ) . " )";
557 return __clean( $str );
561 # make a macro of a given type.
562 # calls into make_trie and (generic_|length_)optree as needed
564 # type : 'cp','generic','low','latin1','utf8','LATIN1','UTF8'
565 # ret_type : 'cp' or 'len'
566 # safe : add length guards to macro
568 # type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
569 # in which case it defaults to 'cp' as well.
571 # it is illegal to do a type 'cp' macro on a pattern with multi-codepoint
572 # sequences in it, as the generated macro will accept only a single codepoint
581 my $type= $opts{type} || 'generic';
582 die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
584 and $self->{has_multi};
585 my $ret_type= $opts{ret_type} || ( $opts{type} eq 'cp' ? 'cp' : 'len' );
588 $method= 'length_optree';
589 } elsif ( $type eq 'generic' ) {
590 $method= 'generic_optree';
594 my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
595 my $text= $self->render( $optree, $type eq 'cp' );
596 my @args= $type eq 'cp' ? 'cp' : 's';
597 push @args, "e" if $opts{safe};
598 push @args, "is_utf8" if $type eq 'generic';
599 push @args, "len" if $ret_type eq 'both';
600 my $pfx= $ret_type eq 'both' ? 'what_len_' :
601 $ret_type eq 'cp' ? 'what_' : 'is_';
602 my $ext= $type eq 'generic' ? '' : '_' . lc( $type );
603 $ext .= "_safe" if $opts{safe};
604 my $argstr= join ",", @args;
605 return "/*** GENERATED CODE ***/\n"
606 . __macro( "#define $pfx$self->{op}$ext($argstr)\n$text" );
609 # if we arent being used as a module (highly likely) then process
610 # the __DATA__ below and produce macros in regcharclass.h
611 # if an argument is provided to the script then it is assumed to
612 # be the path of the file to output to, if the arg is '-' outputs
619 my $path= shift @ARGV;
622 $path= "regcharclass.h";
623 if ( !-e $path ) { $path= "../$path" }
624 if ( !-e $path ) { die "Can't find '$path' to update!\n" }
627 if ( $path eq '-' ) {
630 rename $path, "$path.bak";
631 open $out_fh, ">", $path
632 or die "Can't write to '$path':$!";
633 binmode $out_fh; # want unix line endings even when run on win32.
635 my ( $zero )= $0 =~ /([^\\\/]+)$/;
636 print $out_fh <<"HEADER";
637 /* -*- buffer-read-only: t -*-
641 * Copyright (C) 2007, by Larry Wall and others
643 * You may distribute under the terms of either the GNU General Public
644 * License or the Artistic License, as specified in the README file.
646 * !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
647 * This file is built by Porting/$zero.
649 * Any changes made here will be lost!
655 my ( $op, $title, @txt, @types, @mods );
658 print $out_fh "/*\n\t$op: $title\n\n";
659 print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
660 my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
662 #die Dumper(\@types,\@mods);
664 foreach my $type_spec ( @types ) {
665 my ( $type, $ret )= split /-/, $type_spec;
667 foreach my $mod ( @mods ) {
668 next if $mod eq 'safe' and $type eq 'cp';
669 my $macro= $obj->make_macro(
672 safe => $mod eq 'safe'
674 print $out_fh $macro, "\n";
685 ( $op, $title )= split /\s*:\s*/, $_, 2;
687 } elsif ( s/^=>// ) {
688 my ( $type, $modifier )= split /:/, $_;
689 @types= split ' ', $type;
690 @mods= split ' ', $modifier;
696 print $out_fh "/* ex: set ro: */\n";
697 print "updated $path\n" if $path ne '-';
701 # Valid types: generic, LATIN1, UTF8, low, latin1, utf8
702 # default return value is octects read.
703 # append -cp to make it codepoint matched.
704 # modifiers come after the colon, valid possibilities
705 # being 'fast' and 'safe'.
707 1; # in the unlikely case we are being used as a module
710 LNBREAK: Line Break: \R
711 => generic UTF8 LATIN1 :fast safe
712 "\x0D\x0A" # CRLF - Network (Windows) line ending
713 0x0A # LF | LINE FEED
714 0x0B # VT | VERTICAL TAB
715 0x0C # FF | FORM FEED
716 0x0D # CR | CARRIAGE RETURN
717 0x85 # NEL | NEXT LINE
718 0x2028 # LINE SEPARATOR
719 0x2029 # PARAGRAPH SEPARATOR
721 HORIZWS: Horizontal Whitespace: \h \H
722 => generic UTF8 LATIN1 cp :fast safe
726 0x1680 # OGHAM SPACE MARK
727 0x180e # MONGOLIAN VOWEL SEPARATOR
732 0x2004 # THREE-PER-EM SPACE
733 0x2005 # FOUR-PER-EM SPACE
734 0x2006 # SIX-PER-EM SPACE
735 0x2007 # FIGURE SPACE
736 0x2008 # PUNCTUATION SPACE
739 0x202f # NARROW NO-BREAK SPACE
740 0x205f # MEDIUM MATHEMATICAL SPACE
741 0x3000 # IDEOGRAPHIC SPACE
743 VERTWS: Vertical Whitespace: \v \V
744 => generic UTF8 LATIN1 cp :fast safe
750 0x2028 # LINE SEPARATOR
751 0x2029 # PARAGRAPH SEPARATOR
754 TRICKYFOLD: Problematic fold case letters.
755 => generic cp generic-cp generic-both :fast safe
756 0x00DF # LATIN1 SMALL LETTER SHARP S
757 0x0390 # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
758 0x03B0 # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS