lib/Text/Tradition/Witness.pm

   1 package Text::Tradition::Witness;
   2
   3 use vars qw( %tags );
   4 use JSON;
   5 use Moose;
   6 use Moose::Util::TypeConstraints;
   7 use Text::TEI::Markup qw( word_tag_wrap );
   8 use TryCatch;
   9 use XML::Easy::Syntax qw( $xml10_name_rx );
  10
  11 =head1 NAME
  12
  13 Text::Tradition::Witness - a manuscript witness to a text tradition
  14
  15 =head1 SYNOPSIS
  16
  17   use Text::Tradition::Witness;
  18   my $w = Text::Tradition::Witness->new(
  19     'sigil' => 'A',
  20     'identifier' => 'Oxford MS Ex.1932',
  21     );
  22
  23 =head1 DESCRIPTION
  24
  25 Text::Tradition::Witness is an object representation of a manuscript
  26 witness to a text tradition.  A manuscript has a sigil (a short code that
  27 represents it in the wider tradition), an identifier (e.g. the library ID),
  28 and probably a text.
  29
  30 =head1 METHODS
  31
  32 =head2 new
  33
  34 Create a new witness.  Options include:
  35
  36 =over
  37
  38 =item * sigil - A short code to represent the manuscript.  Required.
  39
  40 =item * sourcetype - What sort of witness data this is. Options are
  41 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
  42 used by Collation parsers.)
  43
  44 =item * file
  45 =item * string
  46 =item * object
  47
  48 The data source for the witness.  Use the appropriate option.
  49
  50 =item * use_text - An initialization option.  If the witness is read from a
  51 TEI document and more than one <text/> tag exists therein, the default
  52 behavior is to use the first defined text.  If this is not desired,
  53 use_text should be set to an XPath expression that will select the correct
  54 text.
  55
  56 =item * language - The name of the applicable L<Text::Tradition::Lang>
  57 module for language handling. Usually inherited from the language set in
  58 the L<Text::Tradition> object, and defaults to Default.
  59
  60 =item * identifier - The recognized name of the manuscript, e.g. a library
  61 identifier. Taken from the msDesc element for a TEI file.
  62
  63 =item * other_info - A freeform string for any other description of the
  64 manuscript.
  65
  66 =back
  67
  68 =head2 sigil
  69
  70 The sigil by which to identify this manuscript, which must conform to the
  71 specification for XML attribute strings (broadly speaking, it must begin
  72 with a letter and can have only a few sorts of punctuation characters in
  73 it.)
  74
  75 =head2 identifier
  76
  77 A freeform name by which to identify the manuscript, which may be longer
  78 than the sigil.  Defaults to 'Unidentified ms', but will be taken from the
  79 TEI msName attribute, or constructed from the settlement and idno if
  80 supplied.
  81
  82 =head2 settlement
  83
  84 The city, town, etc. where the manuscript is held. Will be read from the
  85 TEI msDesc element if supplied.
  86
  87 =head2 repository
  88
  89 The institution that holds the manuscript. Will be read from the TEI msDesc
  90 element if supplied.
  91
  92 =head2 idno
  93
  94 The identification or call number of the manuscript.  Will be read from the
  95 TEI msDesc element if supplied.
  96
  97 =head2 text
  98
  99 An array of strings (words) that contains the text of the
 100 manuscript.  This should not change after the witness has been
 101 instantiated, and the path through the collation should always match it.
 102
 103 =head2 layertext
 104
 105 An array of strings (words) that contains the layered
 106 text, if any, of the manuscript.  This should not change after the witness
 107 has been instantiated, and the path through the collation should always
 108 match it.
 109
 110 =head2 language
 111
 112 Accessor method to get the witness language.
 113
 114 =head2 identifier
 115
 116 Accessor method for the witness identifier.
 117
 118 =head2 other_info
 119
 120 Accessor method for the general witness description.
 121
 122 =head2 is_layered
 123
 124 Boolean method to note whether the witness has layers (e.g. pre-correction
 125 readings) in the collation.
 126
 127 =begin testing
 128
 129 use Text::Tradition;
 130 my $trad = Text::Tradition->new( 'name' => 'test tradition' );
 131 my $c = $trad->collation;
 132
 133 # Test a plaintext witness via string
 134 my $str = 'This is a line of text';
 135 my $ptwit = $trad->add_witness(
 136     'sigil' => 'A',
 137     'sourcetype' => 'plaintext',
 138     'string' => $str
 139      );
 140 is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
 141 if( $ptwit ) {
 142     is( $ptwit->sigil, 'A', "Witness has correct sigil" );
 143     is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
 144 }
 145
 146 # Test some JSON witnesses via object
 147 open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
 148 binmode( JSIN, ':encoding(UTF-8)' );
 149 my @lines = <JSIN>;
 150 close JSIN;
 151 $trad->add_json_witnesses( join( '', @lines ) );
 152 is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
 153         "Found first JSON witness" );
 154 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
 155         "Found second JSON witness" );
 156
 157 # # Test an XML witness via file
 158 # my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
 159 #       'file' => 't/data/witnesses/teiwit.xml' );
 160 # is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
 161 # if( $xmlwit ) {
 162 #       is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
 163 #       ok( $xmlwit->is_layered, "Picked up correction layer" );
 164 #       is( @{$xmlwit->path}, 185, "Got correct text length" );
 165 #       is( @{$xmlwit->uncorrected_path}, 185, "Got correct a.c. text length" );
 166 # }
 167
 168 ## Test use_text
 169
 170 =end testing
 171
 172 =cut
 173
 174 subtype 'SourceType',
 175         as 'Str',
 176         where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
 177         message { 'Source type must be one of xmldesc, plaintext, json, collation' };
 178
 179 subtype 'Sigil',
 180         as 'Str',
 181         where { $_ =~ /\A$xml10_name_rx\z/ },
 182         message { 'Sigil must be a valid XML attribute string' };
 183
 184 no Moose::Util::TypeConstraints;
 185
 186 has 'tradition' => (
 187         'is' => 'ro',
 188         'isa' => 'Text::Tradition',
 189         'required' => 1,
 190         );
 191
 192 # Sigil. Required identifier for a witness, but may be found inside
 193 # the XML file.
 194 has 'sigil' => (
 195         is => 'ro',
 196         isa => 'Sigil',
 197         predicate => 'has_sigil',
 198         writer => '_set_sigil',
 199         );
 200
 201 # Other identifying information
 202 has 'identifier' => (
 203         is => 'rw',
 204         isa => 'Str',
 205         );
 206
 207 has 'settlement' => (
 208         is => 'rw',
 209         isa => 'Str',
 210         );
 211
 212 has 'repository' => (
 213         is => 'rw',
 214         isa => 'Str',
 215         );
 216
 217 has 'idno' => (
 218         is => 'rw',
 219         isa => 'Str',
 220         );
 221
 222 has 'sourcetype' => (
 223         is => 'ro',
 224         isa => 'SourceType',
 225         required => 1,
 226 );
 227
 228 has 'language' => (
 229     is => 'ro',
 230     isa => 'Str',
 231     default => 'Default',
 232     );
 233
 234 # Source. Can be XML obj, JSON data struct, or string.
 235 # Not used if the witness is created by parsing a collation.
 236 has 'file' => (
 237         is => 'ro',
 238         isa => 'Str',
 239         predicate => 'has_file',
 240 );
 241
 242 has 'string' => (
 243         is => 'ro',
 244         isa => 'Str',
 245         predicate => 'has_string',
 246 );
 247
 248 has 'object' => ( # could be anything.
 249         is => 'ro',
 250         predicate => 'has_object',
 251         clearer => 'clear_object',
 252 );
 253
 254 # In the case of a TEI document with multiple texts, specify
 255 # which text is the root. Should be an XPath expression.
 256 has 'use_text' => (
 257         is => 'ro',
 258         isa => 'Str',
 259         );
 260
 261 has 'msdesc' => (  # if we started with a TEI doc
 262         is => 'ro',
 263         isa => 'XML::LibXML::Element',
 264         predicate => 'has_msdesc',
 265         writer => '_save_msdesc',
 266         );
 267
 268 # Text.  This is an array of strings (i.e. word tokens).
 269 # TODO Think about how to handle this for the case of pre-prepared
 270 # collations, where the tokens are in the graph already.
 271 has 'text' => (
 272         is => 'rw',
 273         isa => 'ArrayRef[Str]',
 274         predicate => 'has_text',
 275         );
 276
 277 has 'layertext' => (
 278         is => 'rw',
 279         isa => 'ArrayRef[Str]',
 280         predicate => 'has_layertext',
 281         );
 282
 283 # Path.  This is an array of Reading nodes that can be saved during
 284 # initialization, but should be cleared before saving in a DB.
 285 has 'path' => (
 286         is => 'rw',
 287         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 288         predicate => 'has_path',
 289         clearer => 'clear_path',
 290         );
 291
 292 has 'uncorrected_path' => (
 293         is => 'rw',
 294         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 295         clearer => 'clear_uncorrected_path',
 296         );
 297
 298 has 'is_layered' => (
 299         is => 'rw',
 300         isa => 'Bool',
 301         );
 302
 303 # If we set an uncorrected path, ever, remember that we did so.
 304 around 'uncorrected_path' => sub {
 305         my $orig = shift;
 306         my $self = shift;
 307
 308         $self->is_layered( 1 );
 309         $self->$orig( @_ );
 310 };
 311
 312 sub BUILD {
 313         my $self = shift;
 314         if( $self->has_source ) {
 315                 my $init_sub = '_init_from_' . $self->sourcetype;
 316                 $self->$init_sub();
 317                 # Remove our XML / source objects; we no longer need them.
 318                 $self->clear_object if $self->has_object;
 319                 $self->tradition->collation->make_witness_path( $self );
 320         }
 321         return $self;
 322 }
 323
 324 sub has_source {
 325         my $self = shift;
 326         return $self->has_file || $self->has_string || $self->has_object;
 327 }
 328
 329 sub _init_from_xmldesc {
 330         my $self = shift;
 331         my $xmlobj;
 332         if( $self->has_object ) {
 333                 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
 334                         throw( ident => "bad source",
 335                                    message => "Source object must be an XML::LibXML::Element (this is "
 336                                         . ref( $self->object ) . ");" );
 337                 }
 338                 $xmlobj = $self->object;
 339         } else {
 340                 my $parser = XML::LibXML->new();
 341                 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
 342                 try {
 343                         $xmlobj = $parser->$parsersub( $self->file )->documentElement;
 344                 } catch( XML::LibXML::Error $e ) {
 345                         throw( ident => "bad source",
 346                                    message => "XML parsing error: " . $e->as_string );
 347                 }
 348         }
 349
 350         unless( $xmlobj->nodeName eq 'TEI' ) {
 351                 throw( ident => "bad source",
 352                        message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
 353         }
 354
 355         # Set up the tags we need, with or without namespaces.
 356         map { $tags{$_} = "//$_" }
 357                 qw/ msDesc msName settlement repository idno p lg w seg add del /;
 358         # Set up our XPath object
 359         my $xpc = _xpc_for_el( $xmlobj );
 360         # Use namespace-aware tags if we have to
 361         if( $xmlobj->namespaceURI ) {
 362             map { $tags{$_} = "//tei:$_" } keys %tags;
 363         }
 364
 365         # Get the identifier
 366         if( my $desc = $xpc->find( $tags{msDesc} ) ) {
 367                 my $descnode = $desc->get_node(1);
 368                 $self->_save_msdesc( $descnode );
 369                 # First try to use settlement/repository/idno.
 370                 my( $setNode, $reposNode, $idNode ) =
 371                         ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
 372                           $xpc->find( $tags{repository}, $descnode )->get_node(1),
 373                           $xpc->find( $tags{idno}, $descnode )->get_node(1) );
 374                 $self->settlement( $setNode ? $setNode->textContent : '' );
 375                 $self->repository( $reposNode ? $reposNode->textContent : '' );
 376                 $self->idno( $idNode ? $idNode->textContent : '' );
 377                 if( $self->settlement && $self->idno ) {
 378                 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
 379                 } else {
 380                     # Look for an msName.
 381                     my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
 382                     if( $msNameNode ) {
 383                 $self->identifier( $msNameNode->textContent );
 384             } else {
 385                 # We have an msDesc but who knows what is in it?
 386                 my $desc = $descnode->textContent;
 387                 $desc =~ s/\n/ /gs;
 388                 $desc =~ s/\s+/ /g;
 389                 $self->identifier( $desc );
 390             }
 391         }
 392         if( $descnode->hasAttribute('xml:id') ) {
 393                         $self->_set_sigil( $descnode->getAttribute('xml:id') );
 394                 } elsif( !$self->has_sigil ) {
 395                         throw( ident => 'missing sigil',
 396                                    message => 'Could not find xml:id witness sigil' );
 397                 }
 398         } else {
 399             throw( ident => "bad source",
 400                    message => "Could not find manuscript description element in TEI header" );
 401         }
 402
 403         # Now get the words out.
 404         my @words;
 405         my @layerwords;  # if the witness has layers
 406         # First, make sure all the words are wrapped in tags.
 407         # TODO Make this not necessarily dependent upon whitespace...
 408         word_tag_wrap( $xmlobj );
 409         # Now go text hunting.
 410         my @textnodes;
 411         if( $self->use_text ) {
 412                 @textnodes = $xpc->findnodes( $self->use_text );
 413         } else {
 414                 # Use the first 'text' node in the document.
 415                 @textnodes = $xmlobj->getElementsByTagName( 'text' );
 416         }
 417         my $teitext = $textnodes[0];
 418         if( $teitext ) {
 419                 _tokenize_text( $self, $teitext, \@words, \@layerwords );
 420         } else {
 421             throw( ident => "bad source",
 422                    message => "No text element in document '" . $self->{'identifier'} . "!" );
 423         }
 424
 425         $self->path( \@words );
 426         my $a = join( ' ', map { $_->text } @words );
 427         my $b = join( ' ', map { $_->text } @layerwords );
 428         if( $a ne $b ) {
 429                 $self->uncorrected_path( \@layerwords );
 430         }
 431         # TODO set self->text
 432 }
 433
 434 sub _tokenize_text {
 435         my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
 436         # Strip out the words.
 437         my $xpc = _xpc_for_el( $teitext );
 438         my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
 439         foreach( @divs ) {
 440                 my $place_str;
 441                 if( my $n = $_->getAttribute( 'n' ) ) {
 442                         $place_str = '#DIV_' . $n . '#';
 443                 } else {
 444                         $place_str = '#DIV#';
 445                 }
 446                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
 447         }  # foreach <div/>
 448
 449         # But maybe we don't have any divs.  Just paragraphs.
 450         unless( @divs ) {
 451                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
 452         }
 453 }
 454
 455 sub _objectify_words {
 456         my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
 457
 458         my $xpc = _xpc_for_el( $element );
 459         my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
 460         my @pgraphs = $xpc->findnodes( $xpexpr );
 461     return () unless @pgraphs;
 462     # Set up an expression to look for words and segs
 463     $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
 464         foreach my $pg ( @pgraphs ) {
 465                 # If this paragraph is the descendant of a note element,
 466                 # skip it.
 467                 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
 468                 next if scalar @noop_container;
 469                 # Get the text of each node
 470                 my $first_word = 1;
 471                 # Hunt down each wrapped word/seg, and make an object (or two objects)
 472                 # of it, if necessary.
 473                 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
 474                         my( $text, $uncorr ) = _get_word_object( $c );
 475 #                       try {
 476 #                               ( $text, $uncorr ) = _get_word_object( $c );
 477 #                       } catch( Text::Tradition::Error $e
 478 #                                               where { $_->has_tag( 'lb' ) } ) {
 479 #                               next;
 480 #                       }
 481                         unless( defined $text || defined $uncorr ) {
 482                                 print STDERR "WARNING: no text in node " . $c->nodeName
 483                                         . "\n" unless $c->nodeName eq 'lb';
 484                                 next;
 485                         }
 486                         print STDERR "DEBUG: space found in element node "
 487                                 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
 488
 489                         my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
 490                         while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
 491                                 $ctr++;
 492                         }
 493                         my $id = $self->sigil . 'r' . $ctr;
 494                         my( $word, $acword );
 495                         if( $text ) {
 496                                 $word = $self->tradition->collation->add_reading(
 497                                         { 'id' => $id, 'text' => $text });
 498                         }
 499                         if( $uncorr && $uncorr ne $text ) {
 500                                 $id .= '_ac';
 501                                 $acword = $self->tradition->collation->add_reading(
 502                                         { 'id' => $id, 'text' => $uncorr });
 503                         } elsif( $uncorr ) {
 504                                 $acword = $word;
 505                         }
 506
 507 #                       if( $first_word ) {
 508 #                               $first_word = 0;
 509 #                               # Set the relevant sectioning markers
 510 #                               if( $divmarker ) {
 511 #                                       $w->add_placeholder( $divmarker );
 512 #                                       $divmarker = undef;
 513 #                               }
 514 #                               $w->add_placeholder( '#PG#' );
 515 #                       }
 516                         push( @$wordlist, $word ) if $word;
 517                         push( @$uncorrlist, $acword ) if $acword;
 518                 }
 519     }
 520 }
 521
 522 # Given a word or segment node, make a Reading object for the word
 523 # therein. Make two Reading objects if there is an 'uncorrected' vs.
 524 # 'corrected' state.
 525
 526 sub _get_word_strings {
 527         my( $node ) = @_;
 528         my( $text, $uncorrtext );
 529         # We can have an lb or pb in the middle of a word; if we do, the
 530         # whitespace (including \n) after the break becomes insignificant
 531         # and we want to nuke it.
 532         my $strip_leading_space = 0;
 533         my $word_excluded = 0;
 534         my $xpc = _xpc_for_el( $node );
 535         # TODO This does not cope with nested add/dels.
 536         my @addition = $xpc->findnodes( 'ancestor::' . $tags{add} );
 537         my @deletion = $xpc->findnodes( 'ancestor::' . $tags{del} );
 538         foreach my $c ($node->childNodes() ) {
 539                 if( $c->nodeName eq 'num'
 540                         && defined $c->getAttribute( 'value' ) ) {
 541                         # Push the number.
 542                         $text .= $c->getAttribute( 'value' ) unless @deletion;
 543                         $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
 544                         # If this is just after a line/page break, return to normal behavior.
 545                         $strip_leading_space = 0;
 546                 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
 547                         # Set a flag that strips leading whitespace until we
 548                         # get to the next bit of non-whitespace.
 549                         $strip_leading_space = 1;
 550                 } elsif ( $c->nodeName eq 'fw'   # for catchwords
 551                                   || $c->nodeName eq 'sic'
 552                                   || $c->nodeName eq 'note'      #TODO: decide how to deal with notes
 553                                   || $c->textContent eq ''
 554                                   || ref( $c ) eq 'XML::LibXML::Comment' ) {
 555                         $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
 556                         next;
 557                 } elsif( $c->nodeName eq 'add' ) {
 558                         my( $use, $discard ) = _get_text_from_node( $c );
 559                         $text .= $use;
 560                 } elsif( $c->nodeName eq 'del' ) {
 561                         my( $discard, $use ) = _get_text_from_node( $c );
 562                         $uncorrtext .= $use;
 563                 } else {
 564                         my $tagtxt;
 565                         if( ref( $c ) eq 'XML::LibXML::Text' ) {
 566                                 # A text node.
 567                                 $tagtxt = $c->textContent;
 568                         } else {
 569                                 $tagtxt = _get_text_from_node( $c );
 570                         }
 571                         if( $strip_leading_space ) {
 572                                 $tagtxt =~ s/^[\s\n]+//s;
 573                                 # Unset the flag as soon as we see non-whitespace.
 574                                 $strip_leading_space = 0 if $tagtxt;
 575                         }
 576                         $text .= $tagtxt;
 577                         $uncorrtext .= $tagtxt;
 578                 }
 579         }
 580         throw( ident => "text not found",
 581                tags => [ $node->nodeName ],
 582                message => "No text found in node " . $node->toString(0) )
 583             unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
 584         return( $text, $uncorrtext );
 585 }
 586
 587 sub _split_words {
 588         my( $self, $string, $c ) = @_;
 589         my @raw_words = split( /\s+/, $string );
 590         my @words;
 591         foreach my $w ( @raw_words ) {
 592                 my $id = $self->sigil . 'r'. $c++;
 593                 my %opts = ( 'text' => $w, 'id' => $id, 'language' => $self->language );
 594                 my $w_obj = $self->tradition->collation->add_reading( \%opts );
 595                 # Skip any words that have been canonized out of existence.
 596                 next if( length( $w_obj->text ) == 0 );
 597                 push( @words, $w_obj );
 598         }
 599         return @words;
 600 }
 601
 602 sub _init_from_json {
 603         my( $self ) = shift;
 604         my $wit;
 605         if( $self->has_object ) {
 606                 $wit = $self->object;
 607         } elsif( $self->has_string ) {
 608                 $wit = from_json( $self->string );
 609         } elsif( $self->has_file ) {
 610         my $ok = open( INPUT, $self->file );
 611         unless( $ok ) {
 612                         throw( ident => "bad source",
 613                                    message => 'Could not open ' . $self->file . ' for reading' );
 614         }
 615         binmode( INPUT, ':encoding(UTF-8)' );
 616         my @lines = <INPUT>;
 617         close INPUT;
 618         $wit = from_json( join( '', @lines ) );
 619         }
 620
 621         if( exists $wit->{'id'} ) {
 622                 $self->_set_sigil( $wit->{'id'} );
 623         } elsif( !$self->has_sigil ) {
 624                 throw( ident => 'missing sigil',
 625                            message => 'Could not find witness sigil (id) in JSON spec' );
 626         }
 627         $self->identifier( $wit->{'name'} );
 628         my @words;
 629         my @layerwords;
 630         my( @text, @layertext );
 631         if( exists $wit->{'content'} ) {
 632                 # We need to tokenize the text ourselves.
 633                 @words = _split_words( $self, $wit->{'content'} );
 634         } elsif( exists $wit->{'tokens'} ) {
 635                 # We have a bunch of pretokenized words.
 636                 my $ctr = 0;
 637                 foreach my $token ( @{$wit->{'tokens'}} ) {
 638                         my $w_obj = $self->tradition->collation->add_reading({
 639                                 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 640                         push( @words, $w_obj );
 641                         push( @text, $token->{'t'} ); # TODO unless...?
 642                 }
 643                 ## TODO rethink this JSOn mechanism
 644                 if( exists $wit->{'layertokens'} ) {
 645                         foreach my $token ( @{$wit->{'layertokens'}} ) {
 646                                 my $w_obj = $self->tradition->collation->add_reading({
 647                                         'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 648                                 push( @layerwords, $w_obj );
 649                                 push( @layertext, $token->{'t'} );
 650                         }
 651                 }
 652         }
 653         $self->text( \@text );
 654         $self->layertext( \@layertext ) if @layertext;
 655         $self->path( \@words );
 656         $self->uncorrected_path( \@layerwords ) if @layerwords;
 657 }
 658
 659 sub _init_from_plaintext {
 660     my( $self ) = @_;
 661     my $str;
 662     if( $self->has_file ) {
 663         my $ok = open( INPUT, $self->file );
 664         unless( $ok ) {
 665                         throw( ident => "bad source",
 666                                    message => 'Could not open ' . $self->file . ' for reading' );
 667         }
 668         binmode( INPUT, ':encoding(UTF-8)' );
 669         my @lines = <INPUT>;
 670         close INPUT;
 671         $str = join( '', @lines );
 672     } elsif( $self->has_object ) { # ...seriously?
 673         $str = ${$self->object};
 674     } else {
 675         $str = $self->string;
 676     }
 677
 678     # TODO allow a different word separation expression
 679     my @text = split( /\s+/, $str );
 680     $self->text( \@text );
 681     my @words = _split_words( $self, $str );
 682         $self->path( \@words );
 683 }
 684
 685 sub throw {
 686         Text::Tradition::Error->throw(
 687                 'ident' => 'Witness parsing error',
 688                 'message' => $_[0],
 689                 );
 690 }
 691
 692 sub _xpc_for_el {
 693         my $el = shift;
 694         my $xpc = XML::LibXML::XPathContext->new( $el );
 695                 if( $el->namespaceURI ) {
 696                         $xpc->registerNs( 'tei', $el->namespaceURI );
 697                 }
 698         return $xpc;
 699 }
 700
 701 =head2 export_as_json
 702
 703 Exports the witness as a JSON structure, with the following keys:
 704
 705 =over 4
 706
 707 =item * id - The witness sigil
 708
 709 =item * name - The witness identifier
 710
 711 =item * tokens - An array of hashes of the form { "t":"WORD" }
 712
 713 =back
 714
 715 =begin testing
 716
 717 use Text::Tradition;
 718 my $trad = Text::Tradition->new();
 719
 720 my @text = qw/ Thhis is a line of text /;
 721 my $wit = $trad->add_witness(
 722     'sigil' => 'A',
 723     'string' => join( ' ', @text ),
 724     'sourcetype' => 'plaintext',
 725     'identifier' => 'test witness',
 726      );
 727 my $jsonstruct = $wit->export_as_json;
 728 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 729 is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
 730 is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
 731 foreach my $idx ( 0 .. $#text ) {
 732         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
 733 }
 734
 735 my @ctext = qw( when april with his showers sweet with fruit the drought of march
 736                                 has pierced unto the root );
 737 $trad = Text::Tradition->new(
 738         'input' => 'CollateX',
 739         'file' => 't/data/Collatex-16.xml' );
 740
 741 $jsonstruct = $trad->witness('A')->export_as_json;
 742 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 743 is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
 744 is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
 745 foreach my $idx ( 0 .. $#ctext ) {
 746         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
 747 }
 748
 749 ## TODO test layertext export
 750
 751 =end testing
 752
 753 =cut
 754
 755 sub export_as_json {
 756         my $self = shift;
 757         my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
 758         my $obj =  {
 759                 'id' => $self->sigil,
 760                 'tokens' => \@wordlist,
 761                 'name' => $self->identifier,
 762         };
 763         if( $self->is_layered ) {
 764                 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
 765                 $obj->{'layertokens'} = \@lwlist;
 766         }
 767         return $obj;
 768 }
 769
 770 no Moose;
 771 __PACKAGE__->meta->make_immutable;
 772
 773 =head1 BUGS / TODO
 774
 775 =over
 776
 777 =item * Support encodings other than UTF-8
 778
 779 =back
 780
 781 =head1 LICENSE
 782
 783 This package is free software and is provided "as is" without express
 784 or implied warranty.  You can redistribute it and/or modify it under
 785 the same terms as Perl itself.
 786
 787 =head1 AUTHOR
 788
 789 Tara L Andrews E<lt>aurum@cpan.orgE<gt>