[p5sagit/p5-mst-13.2.git] / ext / Encode / t / gsm0338.t

BEGIN {
    if ($ENV{'PERL_CORE'}){
        chdir 't';
        unshift @INC, '../lib';
    }
    require Config; import Config;
    if ($Config{'extensions'} !~ /\bEncode\b/) {
      print "1..0 # Skip: Encode was not built\n";
      exit 0;
    }
    $| = 1;
}

use strict;
use utf8;
use Test::More tests => 778;
use Encode;
use Encode::GSM0338;

# The specification of GSM 03.38 is not awfully clear.
# (http://www.unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT)
# The various combinations of 0x00 and 0x1B as leading bytes
# are unclear, as is the semantics of those bytes as standalone
# or as final single bytes.


my $chk = Encode::LEAVE_SRC();

# escapes
# see http://www.csoft.co.uk/sms/character_sets/gsm.htm
my %esc_seq = (
	       "\x{20ac}" => "\x1b\x65",
	       "\x0c"     => "\x1b\x0A",
	       "["        => "\x1b\x3C",
	       "\\"       => "\x1b\x2F",
	       "]"        => "\x1b\x3E",
	       "^"        => "\x1b\x14",
	       "{"        => "\x1b\x28",
	       "|"        => "\x1b\x40",
	       "}"        => "\x1b\x29",
	       "~"        => "\x1b\x3D",
);

my %unesc_seq = reverse %esc_seq;


sub eu{
    $_[0] =~ /[\x00-\x1f]/ ? 
	sprintf("\\x{%04X}", ord($_[0])) : encode_utf8($_[0]);
 
}

for my $c ( map { chr } 0 .. 127 ) {
    my $u = $Encode::GSM0338::GSM2UNI{$c};

    # default character set
    is decode( "gsm0338", $c, $chk ), $u,
      sprintf( "decode \\x%02X", ord($c) );
    eval { decode( "gsm0338", $c . "\xff", $chk ) };
    ok( $@, $@ );
    is encode( "gsm0338", $u, $chk ), $c, sprintf( "encode %s", eu($u) );
    eval { encode( "gsm0338", $u . "\x{3000}", $chk ) };
    ok( $@, $@ );

    # nasty atmark
    if ( $c eq "\x00" ) {
        is decode( "gsm0338", "\x00" . $c, $chk ), "\x00",
          sprintf( '@@ =>: \x00+\x%02X', ord($c) );
    }
    else {
        is decode( "gsm0338", "\x00" . $c ), '@' . decode( "gsm0338", $c ),
          sprintf( '@: decode \x00+\x%02X', ord($c) );
    }

    # escape seq.
    my $ecs = "\x1b" . $c;
    if ( $unesc_seq{$ecs} ) {
        is decode( "gsm0338", $ecs, $chk ), $unesc_seq{$ecs},
          sprintf( "ESC: decode ESC+\\x%02X", ord($c) );
        is encode( "gsm0338", $unesc_seq{$ecs}, $chk ), $ecs,
          sprintf( "ESC: encode %s ", eu( $unesc_seq{$ecs} ) );
    }
    else {
        is decode( "gsm0338", $ecs, $chk ),
          "\xA0" . decode( "gsm0338", $c ),
          sprintf( "decode ESC+\\x%02X", ord($c) );
    }
}

__END__
for my $c (map { chr } 0..127){
    my $b = "\x1b$c";
    my $u =  $Encode::GSM0338::GSM2UNI{$b};
    next unless $u;
    $u ||= "\xA0" . $Encode::GSM0338::GSM2UNI{$c};
    is decode("gsm0338", $b), $u, sprintf("decode ESC+\\x%02X", ord($c) );
}

__END__
# old test follows
ub t { is(decode("gsm0338", my $t = $_[0]), $_[1]) }

# t("\x00",     "\x00"); # ???

# "Round-trip".
t("\x41",     "\x41");

t("\x01",     "\xA3");
t("\x02",     "\x24");
t("\x03",     "\xA5");
t("\x09",     "\xE7");

t("\x00\x00", "\x00\x00"); # Maybe?
t("\x00\x1B", "\x40\xA0"); # Maybe?
t("\x00\x41", "\x40\x41");

# t("\x1B",     "\x1B"); # ???

# Escape with no special second byte is just a NBSP.
t("\x1B\x41", "\xA0\x41");

t("\x1B\x00", "\xA0\x40"); # Maybe?

# Special escape characters.
t("\x1B\x0A", "\x0C");
t("\x1B\x14", "\x5E");
t("\x1B\x28", "\x7B");
t("\x1B\x29", "\x7D");
t("\x1B\x2F", "\x5C");
t("\x1B\x3C", "\x5B");
t("\x1B\x3D", "\x7E");
t("\x1B\x3E", "\x5D");
t("\x1B\x40", "\x7C");
t("\x1B\x40", "\x7C");
t("\x1B\x65", "\x{20AC}");
Commit	Line	Data
e74d7437	1	BEGIN {
	2	if ($ENV{'PERL_CORE'}){
	3	chdir 't';
	4	unshift @INC, '../lib';
	5	}
	6	require Config; import Config;
	7	if ($Config{'extensions'} !~ /\bEncode\b/) {
	8	print "1..0 # Skip: Encode was not built\n";
	9	exit 0;
	10	}
	11	$\| = 1;
	12	}
	13
	14	use strict;
44b3b9c7	15	use utf8;
44b3b9c7	16	use Test::More tests => 778;
e74d7437	17	use Encode;
44b3b9c7	18	use Encode::GSM0338;
e74d7437	19
	20	# The specification of GSM 03.38 is not awfully clear.
	21	# (http://www.unicode.org/Public/MAPPINGS/ETSI/GSM0338.TXT)
	22	# The various combinations of 0x00 and 0x1B as leading bytes
	23	# are unclear, as is the semantics of those bytes as standalone
	24	# or as final single bytes.
	25
44b3b9c7	26
	27	my $chk = Encode::LEAVE_SRC();
	28
	29	# escapes
	30	# see http://www.csoft.co.uk/sms/character_sets/gsm.htm
	31	my %esc_seq = (
	32	"\x{20ac}" => "\x1b\x65",
	33	"\x0c" => "\x1b\x0A",
	34	"[" => "\x1b\x3C",
	35	"\\" => "\x1b\x2F",
	36	"]" => "\x1b\x3E",
	37	"^" => "\x1b\x14",
	38	"{" => "\x1b\x28",
	39	"\|" => "\x1b\x40",
	40	"}" => "\x1b\x29",
	41	"~" => "\x1b\x3D",
	42	);
	43
	44	my %unesc_seq = reverse %esc_seq;
	45
	46
	47	sub eu{
	48	$_[0] =~ /[\x00-\x1f]/ ?
	49	sprintf("\\x{%04X}", ord($_[0])) : encode_utf8($_[0]);
	50
	51	}
	52
	53	for my $c ( map { chr } 0 .. 127 ) {
	54	my $u = $Encode::GSM0338::GSM2UNI{$c};
	55
	56	# default character set
	57	is decode( "gsm0338", $c, $chk ), $u,
	58	sprintf( "decode \\x%02X", ord($c) );
	59	eval { decode( "gsm0338", $c . "\xff", $chk ) };
	60	ok( $@, $@ );
	61	is encode( "gsm0338", $u, $chk ), $c, sprintf( "encode %s", eu($u) );
	62	eval { encode( "gsm0338", $u . "\x{3000}", $chk ) };
	63	ok( $@, $@ );
	64
	65	# nasty atmark
	66	if ( $c eq "\x00" ) {
	67	is decode( "gsm0338", "\x00" . $c, $chk ), "\x00",
	68	sprintf( '@@ =>: \x00+\x%02X', ord($c) );
	69	}
	70	else {
	71	is decode( "gsm0338", "\x00" . $c ), '@' . decode( "gsm0338", $c ),
	72	sprintf( '@: decode \x00+\x%02X', ord($c) );
	73	}
	74
	75	# escape seq.
	76	my $ecs = "\x1b" . $c;
	77	if ( $unesc_seq{$ecs} ) {
	78	is decode( "gsm0338", $ecs, $chk ), $unesc_seq{$ecs},
	79	sprintf( "ESC: decode ESC+\\x%02X", ord($c) );
	80	is encode( "gsm0338", $unesc_seq{$ecs}, $chk ), $ecs,
	81	sprintf( "ESC: encode %s ", eu( $unesc_seq{$ecs} ) );
	82	}
	83	else {
	84	is decode( "gsm0338", $ecs, $chk ),
	85	"\xA0" . decode( "gsm0338", $c ),
	86	sprintf( "decode ESC+\\x%02X", ord($c) );
	87	}
	88	}
	89
90	__END__
91	for my $c (map { chr } 0..127){
92	my $b = "\x1b$c";
93	my $u = $Encode::GSM0338::GSM2UNI{$b};
94	next unless $u;
95	$u \|\|= "\xA0" . $Encode::GSM0338::GSM2UNI{$c};
96	is decode("gsm0338", $b), $u, sprintf("decode ESC+\\x%02X", ord($c) );
97	}
98
99	__END__
100	# old test follows
101	ub t { is(decode("gsm0338", my $t = $_[0]), $_[1]) }
e74d7437	102
	103	# t("\x00", "\x00"); # ???
	104
	105	# "Round-trip".
	106	t("\x41", "\x41");
	107
	108	t("\x01", "\xA3");
	109	t("\x02", "\x24");
	110	t("\x03", "\xA5");
	111	t("\x09", "\xE7");
	112
	113	t("\x00\x00", "\x00\x00"); # Maybe?
	114	t("\x00\x1B", "\x40\xA0"); # Maybe?
	115	t("\x00\x41", "\x40\x41");
	116
	117	# t("\x1B", "\x1B"); # ???
	118
	119	# Escape with no special second byte is just a NBSP.
	120	t("\x1B\x41", "\xA0\x41");
	121
	122	t("\x1B\x00", "\xA0\x40"); # Maybe?
	123
	124	# Special escape characters.
	125	t("\x1B\x0A", "\x0C");
	126	t("\x1B\x14", "\x5E");
	127	t("\x1B\x28", "\x7B");
	128	t("\x1B\x29", "\x7D");
	129	t("\x1B\x2F", "\x5C");
	130	t("\x1B\x3C", "\x5B");
	131	t("\x1B\x3D", "\x7E");
	132	t("\x1B\x3E", "\x5D");
	133	t("\x1B\x40", "\x7C");
	134	t("\x1B\x40", "\x7C");
	135	t("\x1B\x65", "\x{20AC}");