package Unicode::UCD;
use strict;
use warnings;
no warnings 'surrogate'; # surrogates can be inputs to this
use charnames ();
our $VERSION = '0.75';
# Compile-time debugging switch.  The empty prototype together with the
# constant body makes DEBUG eligible for inlining as a constant, so code
# guarded by "if DEBUG" can be optimized away when it is 0.  Keep this exact
# form: adding statements to the body would defeat the inlining.
sub DEBUG () { 0 }
# When debugging is on, make the currently selected output handle autoflush
# so diagnostic output is not held back in a buffer.
$|=1 if DEBUG;
# Exporter setup.  Every name below is offered through @EXPORT_OK, i.e. it is
# exported only when the caller asks for it by name in its "use" line (no
# @EXPORT default list is set in this part of the file).
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(charinfo
charblock charscript
charblocks charscripts
charinrange
charprop
charprops_all
general_categories bidi_types
compexcl
casefold all_casefolds casespec
namedseq
num
prop_aliases
prop_value_aliases
prop_values
prop_invlist
prop_invmap
search_invlist
MAX_CP
);
use Carp;
# Returns true when the native character set places "A" at ordinal 65, which
# is how this module recognizes an ASCII-based platform (as opposed to, e.g.,
# an EBCDIC one); returns false otherwise.
sub IS_ASCII_PLATFORM {
    return 65 == ord 'A';
}
=head1 NAME
Unicode::UCD - Unicode character database
=head1 SYNOPSIS
use Unicode::UCD 'charinfo';
my $charinfo = charinfo($codepoint);
use Unicode::UCD 'charprop';
my $value = charprop($codepoint, $property);
use Unicode::UCD 'charprops_all';
my $all_values_hash_ref = charprops_all($codepoint);
use Unicode::UCD 'casefold';
my $casefold = casefold($codepoint);
use Unicode::UCD 'all_casefolds';
my $all_casefolds_ref = all_casefolds();
use Unicode::UCD 'casespec';
my $casespec = casespec($codepoint);
use Unicode::UCD 'charblock';
my $charblock = charblock($codepoint);
use Unicode::UCD 'charscript';
my $charscript = charscript($codepoint);
use Unicode::UCD 'charblocks';
my $charblocks = charblocks();
use Unicode::UCD 'charscripts';
my $charscripts = charscripts();
use Unicode::UCD qw(charscript charinrange);
my $range = charscript($script);
print "looks like $script\n" if charinrange($range, $codepoint);
use Unicode::UCD qw(general_categories bidi_types);
my $categories = general_categories();
my $types = bidi_types();
use Unicode::UCD 'prop_aliases';
my @space_names = prop_aliases("space");
use Unicode::UCD 'prop_value_aliases';
my @gc_punct_names = prop_value_aliases("Gc", "Punct");
use Unicode::UCD 'prop_values';
my @all_EA_short_names = prop_values("East_Asian_Width");
use Unicode::UCD 'prop_invlist';
my @puncts = prop_invlist("gc=punctuation");
use Unicode::UCD 'prop_invmap';
my ($list_ref, $map_ref, $format, $missing)
= prop_invmap("General Category");
use Unicode::UCD 'search_invlist';
my $index = search_invlist(\@invlist, $code_point);
# The following function should be used only internally in
# implementations of the Unicode Normalization Algorithm, and there
# are better choices than it.
use Unicode::UCD 'compexcl';
my $compexcl = compexcl($codepoint);
use Unicode::UCD 'namedseq';
my $namedseq = namedseq($named_sequence_name);
my $unicode_version = Unicode::UCD::UnicodeVersion();
my $convert_to_numeric =
Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
=head1 DESCRIPTION
The Unicode::UCD module offers a series of functions that
provide a simple interface to the Unicode
Character Database.
=head2 code point argument
Some of the functions are called with a I<code point argument>, which is either
a decimal or a hexadecimal scalar designating a code point in the platform's
native character set (extended to Unicode), or a string containing C<U+>
followed by hexadecimals
designating a Unicode code point.  A leading 0 will force a hexadecimal
interpretation, as will a hexadecimal digit that isn't a decimal digit.
Examples:

    223     # Decimal 223 in native character set
    0223    # Hexadecimal 223, native (= 547 decimal)
    0xDF    # Hexadecimal DF, native (= 223 decimal)
    '0xDF'  # String form of hexadecimal (= 223 decimal)
    'U+DF'  # Hexadecimal DF, in Unicode's character set
            # (= LATIN SMALL LETTER SHARP S)

Note that the largest code point in Unicode is U+10FFFF.
=cut
# Package-level state shared by the functions in this module.  These are
# only declared here; nothing in this part of the file assigns to them.
# NOTE(review): presumably they are filled in from the generated Unicode
# data files loaded elsewhere in the module -- confirm against the loaders.
our %caseless_equivalent;
our $e_precision;
our %file_to_swash_name;
our @inline_definitions;
our %loose_property_name_of;
our %loose_property_to_file_of;
our %loose_to_file_of;
our $MAX_CP;
our %nv_floating_to_rational;
our %prop_aliases;
our %stricter_to_file_of;
our %strict_property_to_file_of;
our %SwashInfo;
our %why_deprecated;
my $v_unicode_version; # v-string.
# openunicode(@path)
#
# Locates a file of the Unicode database and opens it for reading.  @path is
# the list of path components below a "unicore" directory.  Each directory in
# @INC is tried in order; the read handle of the first file that opens
# successfully is returned.  Croaks if no directory in @INC yields an
# openable file.
sub openunicode {
    use File::Spec;

    # Path of the wanted file relative to any one @INC directory.
    my @relative = ("unicore", @_);

    foreach my $inc_dir (@INC) {
        my $candidate = File::Spec->catfile($inc_dir, @relative);
        if (open(my $fh, '<', $candidate)) {
            return $fh;
        }
    }

    croak __PACKAGE__, ": failed to find ",
          File::Spec->catfile(@relative), " in @INC";
}
sub _dclone ($) { # Deep-copy a structure: Storable::dclone if we have it, else by hand.

    # Storable's dclone is imported at compile time, but only when
    # &DynaLoader::boot_DynaLoader is defined (presumably so that a perl
    # that cannot load XS modules still compiles this file).  When the
    # import happened, simply delegate to it.
    use if defined &DynaLoader::boot_DynaLoader, Storable => qw(dclone);
    return dclone(shift) if defined &dclone;

    # Pure-Perl fallback.  Takes one value and returns its deep copy;
    # croaks on any reference type other than ARRAY or HASH.
    my $source = shift;
    my $kind = ref $source;

    # A non-reference is returned as it is; there is nothing to descend into.
    return $source if ! $kind;

    # The recursive calls use the & form so the ($) prototype plays no part
    # while this sub is still being compiled.
    return [ map { &_dclone($_) } @$source ] if $kind eq 'ARRAY';

    if ($kind eq 'HASH') {
        my %copy;

        # keys() and values() walk an unmodified hash in the same order, so
        # each cloned value lands under its own key.
        @copy{ keys %$source } = map { &_dclone($_) } values %$source;
        return \%copy;
    }

    croak "_dclone can't handle " . $kind;
}
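=head2 B<charinfo()>

    use Unicode::UCD 'charinfo';

    my $charinfo = charinfo(0x41);

This returns information about the input L</code point argument>
as a reference to a hash of fields as defined by the Unicode
standard.  If the L</code point argument> is not assigned in the standard
(i.e., has the general category C<Cn> meaning C<Unassigned>)
or is a non-character (meaning it is guaranteed to never be assigned in
the standard),
C<undef> is returned.

Fields that aren't applicable to the particular code point argument exist in the
returned hash, and are empty.

The keys in the hash with the meanings of their values are:

=over

=item B<code>

the input native L</code point argument> expressed in hexadecimal, with
leading zeros
added if necessary to make it contain at least four hexdigits
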
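=item B<name>

name of I<code>, all IN UPPER CASE.
Some control-type code points do not have names.
This field will be empty for C<Surrogate> and C<Private Use> code points,
and for the others without a name,
it will contain a description enclosed in angle brackets, like
C<E<lt>controlE<gt>>.

=item B<category>

The short name of the general category of I<code>.
This will match one of the keys in the hash returned by L</general_categories()>.

The L</prop_value_aliases()> function can be used to get all the synonyms
of the category name.
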
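=item B<combining>

the combining class number for I<code> used in the Canonical Ordering Algorithm.
For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
available at
L<http://www.unicode.org/versions/Unicode5.1.0/>

The L</prop_value_aliases()> function can be used to get all the synonyms
of the combining class number.

=item B<bidi>

bidirectional type of I<code>.
This will match one of the keys in the hash returned by L</bidi_types()>.

The L</prop_value_aliases()> function can be used to get all the synonyms
of the bidi type name.
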
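=item B<decomposition>

is empty if I<code> has no decomposition; or is one or more codes
(separated by spaces) that, taken in order, represent a decomposition for
I<code>.  Each has at least four hexdigits.
The codes may be preceded by a word enclosed in angle brackets, then a space,
like C<E<lt>compatE<gt> >, giving the type of decomposition

This decomposition may be an intermediate one whose components are also
decomposable.  Use L<Unicode::Normalize> to get the final decomposition in one
step.

=item B<decimal>

if I<code> represents a decimal digit this is its integer numeric value
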
=item B<digit>

if I<code> represents some other digit-like number, this is its integer
numeric value

=item B<numeric>

if I<code> represents a whole or rational number, this is its numeric value.
Rational values are expressed as a string like C<1/4>.

=item B<mirrored>

C<Y> or C<N> designating if I<code> is mirrored in bidirectional text

=item B<unicode10>

name of I<code> in the Unicode 1.0 standard if one
existed for this code point and is different from the current name

=item B<comment>

As of Unicode 6.0, this is always empty.

=item B<upper>

is, if non-empty, the uppercase mapping for I<code> expressed as at least four
hexdigits.  This indicates that the full uppercase mapping is a single
character, and is identical to the simple (single-character only) mapping.
When this field is empty, it means that the simple uppercase mapping is
I<code> itself; you'll need some other means (like L</charprop()> or
L</casespec()>) to get the full mapping.

=item B<lower>

is, if non-empty, the lowercase mapping for I<code> expressed as at least four
hexdigits.  This indicates that the full lowercase mapping is a single
character, and is identical to the simple (single-character only) mapping.
When this field is empty, it means that the simple lowercase mapping is
I<code> itself; you'll need some other means (like L</charprop()> or
L</casespec()>) to get the full mapping.

=item B<title>

is, if non-empty, the titlecase mapping for I<code> expressed as at least four
hexdigits.  This indicates that the full titlecase mapping is a single
character, and is identical to the simple (single-character only) mapping.
When this field is empty, it means that the simple titlecase mapping is
I<code> itself; you'll need some other means (like L</charprop()> or
L</casespec()>) to get the full mapping.

=item B<block>

the block I<code> belongs to (used in C<\p{Blk=...}>).
The L</prop_value_aliases()> function can be used to get all the synonyms
of the block name.

See L</Blocks versus Scripts>.

=item B