use warnings;
use Carp;
-our $VERSION = '0.14';
+our $VERSION = '0.16';
our $PACKAGE = __PACKAGE__;
require Exporter;
our @EXPORT = qw( NFC NFD NFKC NFKD );
our @EXPORT_OK = qw(
normalize decompose reorder compose
- getCanon getCompat getComposite getCombinClass isExclusion
+ checkNFD checkNFKD checkNFC checkNFKC check
+ getCanon getCompat getComposite getCombinClass
+ isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
+ isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
+);
+our %EXPORT_TAGS = (
+ all => [ @EXPORT, @EXPORT_OK ],
+ normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
+ check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
);
-our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
bootstrap Unicode::Normalize $VERSION;
sub NFD ($) { reorder(decompose($_[0])) } # Form D: decompose() the argument (no second arg), then reorder() the result
sub NFKD ($) { reorder(decompose($_[0], COMPAT)) } # Form KD: like NFD, but passes COMPAT as decompose()'s second argument (COMPAT is defined outside this view; presumably true => compatibility mapping)
-
sub NFC ($) { compose(reorder(decompose($_[0]))) } # Form C: the same decompose()/reorder() pipeline as NFD, followed by compose()
sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) } # Form KC: the same pipeline as NFKD (decompose() with COMPAT), followed by compose()
croak $PACKAGE."::normalize: invalid form name: $form";
}
+sub check($$) # check(FORM_NAME, STRING): dispatch to the checkNF* routine selected by FORM_NAME
+{
+ my $form = shift; # form name; after this shift, $_[0] is the string to check
+ $form =~ s/^NF//; # drop a leading "NF" so that e.g. "NFD" and "D" select the same branch
+ return
+ $form eq 'D' ? checkNFD ($_[0]) :
+ $form eq 'C' ? checkNFC ($_[0]) :
+ $form eq 'KD' ? checkNFKD($_[0]) :
+ $form eq 'KC' ? checkNFKC($_[0]) :
+ croak $PACKAGE."::check: invalid form name: $form"; # any other name is fatal; $form is reported with the "NF" prefix already stripped
+}
+
1;
__END__
use Unicode::Normalize;
- $string_NFD = NFD($raw_string); # Normalization Form D
- $string_NFC = NFC($raw_string); # Normalization Form C
- $string_NFKD = NFKD($raw_string); # Normalization Form KD
- $string_NFKC = NFKC($raw_string); # Normalization Form KC
+ $NFD_string = NFD($string); # Normalization Form D
+ $NFC_string = NFC($string); # Normalization Form C
+ $NFKD_string = NFKD($string); # Normalization Form KD
+ $NFKC_string = NFKC($string); # Normalization Form KC
or
use Unicode::Normalize 'normalize';
- $string_NFD = normalize('D', $raw_string); # Normalization Form D
- $string_NFC = normalize('C', $raw_string); # Normalization Form C
- $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD
- $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC
+ $NFD_string = normalize('D', $string); # Normalization Form D
+ $NFC_string = normalize('C', $string); # Normalization Form C
+ $NFKD_string = normalize('KD', $string); # Normalization Form KD
+ $NFKC_string = normalize('KC', $string); # Normalization Form KC
=head1 DESCRIPTION
=over 4
-=item C<$string_NFD = NFD($raw_string)>
+=item C<$NFD_string = NFD($string)>
returns the Normalization Form D (formed by canonical decomposition).
-
-=item C<$string_NFC = NFC($raw_string)>
+=item C<$NFC_string = NFC($string)>
returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).
-=item C<$string_NFKD = NFKD($raw_string)>
+=item C<$NFKD_string = NFKD($string)>
returns the Normalization Form KD (formed by compatibility decomposition).
-=item C<$string_NFKC = NFKC($raw_string)>
+=item C<$NFKC_string = NFKC($string)>
returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).
-=item C<$normalized_string = normalize($form_name, $raw_string)>
+=item C<$normalized_string = normalize($form_name, $string)>
As C<$form_name>, one of the following names must be given.
=back
+=head2 Decomposition and Composition
+
+=over 4
+
+=item C<$decomposed_string = decompose($string)>
+
+=item C<$decomposed_string = decompose($string, $useCompatMapping)>
+
+Decomposes the specified string and returns the result.
+
+If the second parameter (a boolean) is omitted or false, decomposes it
+using the Canonical Decomposition Mapping.
+If true, decomposes it using the Compatibility Decomposition Mapping.
+
+The string returned is not always in NFD/NFKD.
+Reordering may be required.
+
+ $NFD_string = reorder(decompose($string)); # eq. to NFD()
+ $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
+
+=item C<$reordered_string = reorder($string)>
+
+Reorders the combining characters and the like into the canonical ordering
+and returns the result.
+
+E.g., when you have a list of NFD/NFKD strings,
+you can get the concatenated NFD/NFKD string from them, saying
+
+ $concat_NFD = reorder(join '', @NFD_strings);
+ $concat_NFKD = reorder(join '', @NFKD_strings);
+
+=item C<$composed_string = compose($string)>
+
+Returns the string where composable pairs are composed.
+
+E.g., when you have a NFD/NFKD string,
+you can get its NFC/NFKC string, saying
+
+ $NFC_string = compose($NFD_string);
+ $NFKC_string = compose($NFKD_string);
+
+=back
+
+=head2 Quick Check
+
+(see Annex 8, UAX #15; F<DerivedNormalizationProperties.txt>)
+
+The following functions check whether the string is in that normalization form.
+
+The result returned will be:
+
+ YES The string is in that normalization form.
+ NO The string is not in that normalization form.
+ MAYBE Dubious. Maybe yes, maybe no.
+
+=over 4
+
+=item C<$result = checkNFD($string)>
+
+returns YES (1) or NO (empty string).
+
+=item C<$result = checkNFC($string)>
+
+returns YES (1), NO (empty string), or MAYBE (undef).
+
+=item C<$result = checkNFKD($string)>
+
+returns YES (1) or NO (empty string).
+
+=item C<$result = checkNFKC($string)>
+
+returns YES (1), NO (empty string), or MAYBE (undef).
+
+=item C<$result = check($form_name, $string)>
+
+returns YES (1), NO (empty string), or MAYBE (undef).
+
+C<$form_name> is similar to that for C<normalize()>.
+
+=back
+
+B<Note>
+
+In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
+The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
+
+A MAYBE-NFC/NFKC string should contain at least
+one combining character or the like.
+For example, C<COMBINING ACUTE ACCENT> has
+the MAYBE_NFC/MAYBE_NFKC property.
+Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
+and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
+However, C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
+(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
+while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
+
+If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
+
+ $string eq NFC($string) # more thorough than checkNFC($string)
+ $string eq NFKC($string) # more thorough than checkNFKC($string)
+
=head2 Character Data
These functions provide an interface to the character data used internally.
=item C<$canonical_decomposed = getCanon($codepoint)>
+If the character of the specified codepoint is canonically
+decomposable (including Hangul Syllables),
+returns the B<completely decomposed> string canonically equivalent to it.
+
+If it is not decomposable, returns undef.
+
=item C<$compatibility_decomposed = getCompat($codepoint)>
-If the character of the specified codepoint is canonically or
-compatibility decomposable (including Hangul Syllables),
-returns the B<completely decomposed> string equivalent to it.
+If the character of the specified codepoint is compatibility
+decomposable (including Hangul Syllables),
+returns the B<completely decomposed> string compatibility equivalent to it.
If it is not decomposable, returns undef.
-=item C<$uv_composite = getComposite($uv_here, $uv_next)>
+=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
If two characters here and next (as codepoints) are composable
-(including Hangul Jamo/Syllables and Exclusions),
+(including Hangul Jamo/Syllables and Composition Exclusions),
returns the codepoint of the composite.
If they are not composable, returns undef.
=item C<$combining_class = getCombinClass($codepoint)>
-Returns the combining class as integer of the character.
+Returns the combining class of the character as an integer.
=item C<$is_exclusion = isExclusion($codepoint)>
+Returns a boolean indicating whether the character of the specified codepoint
+is a composition exclusion.
+
+=item C<$is_singleton = isSingleton($codepoint)>
+
Returns a boolean indicating whether the character of the specified codepoint is
-a composition exclusion.
+a singleton.
+
+=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
+
+Returns a boolean indicating whether the canonical decomposition
+of the character of the specified codepoint
+is a Non-Starter Decomposition.
+
+=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
+
+Returns a boolean indicating whether the character of the specified codepoint
+may be composed with the previous one in a certain composition
+(including Hangul Compositions, but excluding
+Composition Exclusions and Non-Starter Decompositions).
=back
C<normalize> and some other functions: on request.
-=head2 TODO
-
-Unicode::Normalize has not been ported to EBCDIC. The code mostly
-would work just fine but a decision needs to be made: how the module
-should work in EBCDIC? Should the low 256 characters be understood as
-Unicode or as EBCDIC code points? Should one be chosen or should
-there be a way to do either? Or should such translation be left
-outside the module for the user to do, for example by using
-Encode::from_to()?
-
=head1 AUTHOR
SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
Unicode Normalization Forms - UAX #15
+=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProperties.txt
+
+Derived Normalization Properties
+
=back
=cut