1 # $Id: EncodingDetect.pm,v 1.6 2007-02-07 09:33:50 grant Exp $
3 package XML::SAX::PurePerl; # NB, not ::EncodingDetect!
# Sniff the document's character encoding from the first bytes of the
# reader's buffer and tell the reader which encoding to use.
#
# NOTE(review): this view of the routine is lossy -- the original line
# numbers embedded below skip (e.g. 16 -> 19), so the enclosing sub header,
# the closing brace of each branch and any statements on the skipped lines
# are not visible. The comments describe the visible statements only.
#
# Arguments (unpacked from @_):
#   $parser - not referenced in the visible lines
#   $reader - object on which data(), move_along() and set_encoding() are
#             called
#
# The byte patterns appear to follow the encoding autodetection table in the
# XML 1.0 specification (appendix on autodetection of character encodings)
# -- TODO confirm against the spec when changing any pattern.
8 my ($parser, $reader) = @_;
# NOTE(review): $error is assigned but never read in the visible lines;
# check the skipped lines before removing it.
10 my $error = "Invalid byte sequence at start of file";
# All patterns below are anchored at the start of this buffer.
12 my $data = $reader->data;
# --- Four-byte marks beginning 00 00 (U+FEFF stored in 32-bit units) ---
# 00 00 FE FF: call move_along(4) -- presumably skips the mark itself,
# verify against the reader class -- then select UCS-4BE.
13 if ($data =~ /^\x00\x00\xFE\xFF/) {
15 $reader->move_along(4);
16 $reader->set_encoding('UCS-4BE');
# 00 00 FF FE: same handling, but the unusual 2143 octet order.
19 elsif ($data =~ /^\x00\x00\xFF\xFE/) {
21 $reader->move_along(4);
22 $reader->set_encoding('UCS-4-2143');
# --- No mark: 0x3C ('<') padded with zero bytes to four octets ---
# Nothing is skipped in these branches (no move_along call); only the
# encoding is set. The position of 0x3C within the four bytes selects the
# octet order.
25 elsif ($data =~ /^\x00\x00\x00\x3C/) {
26 $reader->set_encoding('UCS-4BE');
29 elsif ($data =~ /^\x00\x00\x3C\x00/) {
30 $reader->set_encoding('UCS-4-2143');
33 elsif ($data =~ /^\x00\x3C\x00\x00/) {
34 $reader->set_encoding('UCS-4-3412');
# 00 3C 00 3F: '<' and '?' each as a 16-bit big-endian unit.
37 elsif ($data =~ /^\x00\x3C\x00\x3F/) {
38 $reader->set_encoding('UTF-16BE');
# --- Marks beginning FF FE / FE FF ---
# The four-byte pattern FF FE 00 00 is tested before the two-byte FF FE
# pattern, otherwise the shorter pattern would match first.
41 elsif ($data =~ /^\xFF\xFE\x00\x00/) {
43 $reader->move_along(4);
44 $reader->set_encoding('UCS-4LE');
# FF FE not followed by 00 00: two bytes are passed to move_along.
47 elsif ($data =~ /^\xFF\xFE/) {
48 $reader->move_along(2);
49 $reader->set_encoding('UTF-16LE');
# Same ordering concern for FE FF 00 00 versus plain FE FF.
52 elsif ($data =~ /^\xFE\xFF\x00\x00/) {
53 $reader->move_along(4);
54 $reader->set_encoding('UCS-4-3412');
57 elsif ($data =~ /^\xFE\xFF/) {
58 $reader->move_along(2);
59 $reader->set_encoding('UTF-16BE');
# Three-byte mark: move_along(3), then UTF-8 is selected explicitly.
62 elsif ($data =~ /^\xEF\xBB\xBF/) { # UTF-8 BOM
63 $reader->move_along(3);
64 $reader->set_encoding('UTF-8');
# --- No mark, little-endian layouts of '<' (and '?') ---
67 elsif ($data =~ /^\x3C\x00\x00\x00/) {
68 $reader->set_encoding('UCS-4LE');
71 elsif ($data =~ /^\x3C\x00\x3F\x00/) {
72 $reader->set_encoding('UTF-16LE');
# --- Single-byte-per-character starts: "<?xm", "<?x", "<?", "<" ---
# In each of these branches the set_encoding call is commented out, so the
# visible lines take no action on a match. Presumably the reader's current
# encoding is left in place -- confirm against the skipped lines and the
# reader class.
75 elsif ($data =~ /^\x3C\x3F\x78\x6D/) {
76 # $reader->set_encoding('UTF-8');
79 elsif ($data =~ /^\x3C\x3F\x78/) {
80 # $reader->set_encoding('UTF-8');
83 elsif ($data =~ /^\x3C\x3F/) {
84 # $reader->set_encoding('UTF-8');
87 elsif ($data =~ /^\x3C/) {
88 # $reader->set_encoding('UTF-8');
# One or more of space, tab, LF or CR, then '<' followed by any byte other
# than '?'. Again no action is visible for this match.
91 elsif ($data =~ /^[\x20\x09\x0A\x0D]+\x3C[^\x3F]/) {
92 # $reader->set_encoding('UTF-8');
# 4C 6F A7 94 is "<?xm" in EBCDIC code pages.
95 elsif ($data =~ /^\x4C\x6F\xA7\x94/) {
96 $reader->set_encoding('EBCDIC');
# Emits a warning via warn(). Presumably this is the fall-through for input
# that matched none of the patterns above -- the branch structure around
# this line is not visible here.
100 warn("Unable to recognise encoding of this document");