base/lib/Text/Tradition/Witness.pm

   1 package Text::Tradition::Witness;
   2
   3 use vars qw( %tags );
   4 use JSON;
   5 use Moose;
   6 use Text::Tradition::Datatypes;
   7 use Text::TEI::Markup qw( word_tag_wrap );
   8 use TryCatch;
   9
  10 =head1 NAME
  11
  12 Text::Tradition::Witness - a manuscript witness to a text tradition
  13
  14 =head1 SYNOPSIS
  15
  16   use Text::Tradition::Witness;
  17   my $w = Text::Tradition::Witness->new(
  18     'sigil' => 'A',
  19     'identifier' => 'Oxford MS Ex.1932',
  20     );
  21
  22 =head1 DESCRIPTION
  23
  24 Text::Tradition::Witness is an object representation of a manuscript
  25 witness to a text tradition.  A manuscript has a sigil (a short code that
  26 represents it in the wider tradition), an identifier (e.g. the library ID),
  27 and probably a text.
  28
  29 =head1 METHODS
  30
  31 =head2 new
  32
  33 Create a new witness.  Options include:
  34
  35 =over
  36
  37 =item * sigil - A short code to represent the manuscript.  Required.
  38
  39 =item * sourcetype - What sort of witness data this is. Options are
  40 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
  41 used by Collation parsers.)
  42
  43 =item * file
  44 =item * string
  45 =item * object
  46
  47 The data source for the witness.  Use the appropriate option.
  48
  49 =item * use_text - An initialization option.  If the witness is read from a
  50 TEI document and more than one <text/> tag exists therein, the default
  51 behavior is to use the first defined text.  If this is not desired,
  52 use_text should be set to an XPath expression that will select the correct
  53 text.
  54
  55 =item * identifier - The recognized name of the manuscript, e.g. a library
  56 identifier. Taken from the msDesc element for a TEI file.
  57
  58 =item * other_info - A freeform string for any other description of the
  59 manuscript.
  60
  61 =back
  62
  63 =head2 sigil
  64
  65 The sigil by which to identify this manuscript, which must conform to the
  66 specification for XML attribute strings (broadly speaking, it must begin
  67 with a letter and can have only a few sorts of punctuation characters in
  68 it.)
  69
  70 =head2 identifier
  71
  72 A freeform name by which to identify the manuscript, which may be longer
  73 than the sigil.  Defaults to 'Unidentified ms', but will be taken from the
  74 TEI msName attribute, or constructed from the settlement and idno if
  75 supplied.
  76
  77 =head2 settlement
  78
  79 The city, town, etc. where the manuscript is held. Will be read from the
  80 TEI msDesc element if supplied.
  81
  82 =head2 repository
  83
  84 The institution that holds the manuscript. Will be read from the TEI msDesc
  85 element if supplied.
  86
  87 =head2 idno
  88
  89 The identification or call number of the manuscript.  Will be read from the
  90 TEI msDesc element if supplied.
  91
  92 =head2 text
  93
  94 An array of strings (words) that contains the text of the
  95 manuscript.  This should not change after the witness has been
  96 instantiated, and the path through the collation should always match it.
  97
  98 =head2 layertext
  99
 100 An array of strings (words) that contains the layered
 101 text, if any, of the manuscript.  This should not change after the witness
 102 has been instantiated, and the path through the collation should always
 103 match it.
 104
 105 =head2 identifier
 106
 107 Accessor method for the witness identifier.
 108
 109 =head2 other_info
 110
 111 Accessor method for the general witness description.
 112
 113 =head2 has_source
 114
 115 Boolean method that returns a true value if the witness was created with a
 116 data source (that is, a file, string, or object to be parsed).
 117
 118 =head2 is_layered
 119
 120 Boolean method to note whether the witness has layers (e.g. pre-correction
 121 readings) in the collation.
 122
 123 =begin testing
 124
 125 use Text::Tradition;
 126 my $trad = Text::Tradition->new( 'name' => 'test tradition' );
 127 my $c = $trad->collation;
 128
 129 # Test a plaintext witness via string
 130 my $str = 'This is a line of text';
 131 my $ptwit = $trad->add_witness(
 132     'sigil' => 'A',
 133     'sourcetype' => 'plaintext',
 134     'string' => $str
 135      );
 136 is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
 137 if( $ptwit ) {
 138     is( $ptwit->sigil, 'A', "Witness has correct sigil" );
 139     $c->make_witness_path( $ptwit );
 140     is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
 141 }
 142
 143 # Test some JSON witnesses via object
 144 open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
 145 binmode( JSIN, ':encoding(UTF-8)' );
 146 my @lines = <JSIN>;
 147 close JSIN;
 148 $trad->add_json_witnesses( join( '', @lines ) );
 149 is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
 150         "Found first JSON witness" );
 151 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
 152         "Found second JSON witness" );
 153
 154 # Test an XML witness via file
 155 my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
 156         'file' => 't/data/witnesses/teiwit.xml' );
 157 is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
 158 if( $xmlwit ) {
 159         is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
 160         ok( $xmlwit->is_layered, "Picked up correction layer" );
 161         is( @{$xmlwit->text}, 182, "Got correct text length" );
 162         is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
 163 }
 164 my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
 165 is( @allwitwords, 184, "Reused appropriate readings" );
 166
 167 ## Test use_text
 168 my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
 169         'file' => 't/data/witnesses/group.xml',
 170         'use_text' => '//tei:group/tei:text[2]' );
 171 is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
 172 if( $xpwit ) {
 173         is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
 174         ok( !$xpwit->is_layered, "Picked up no correction layer" );
 175         is( @{$xpwit->text}, 157, "Got correct text length" );
 176 }
 177
 178
 179 =end testing
 180
 181 =cut
 182
 183 # Enable plugin(s) if available
 184 eval { with 'Text::Tradition::WitLanguage'; };
 185
 186 has 'tradition' => (
 187         is => 'ro',
 188         isa => 'Text::Tradition',
 189         required => 1,
 190         weak_ref => 1
 191         );
 192
 193 # Sigil. Required identifier for a witness, but may be found inside
 194 # the XML file.
 195 has 'sigil' => (
 196         is => 'ro',
 197         isa => 'Sigil',
 198         predicate => 'has_sigil',
 199         writer => '_set_sigil',
 200         );
 201
 202 # Other identifying information
 203 has 'identifier' => (
 204         is => 'rw',
 205         isa => 'Str',
 206         );
 207
 208 has 'settlement' => (
 209         is => 'rw',
 210         isa => 'Str',
 211         );
 212
 213 has 'repository' => (
 214         is => 'rw',
 215         isa => 'Str',
 216         );
 217
 218 has 'idno' => (
 219         is => 'rw',
 220         isa => 'Str',
 221         );
 222
 223 # Source. Can be XML obj, JSON data struct, or string.
 224 # Not used if the witness is created by parsing a collation.
 225 has 'sourcetype' => (
 226         is => 'ro',
 227         isa => 'SourceType',
 228         required => 1,
 229 );
 230
 231 has 'file' => (
 232         is => 'ro',
 233         isa => 'Str',
 234         predicate => 'has_file',
 235 );
 236
 237 has 'string' => (
 238         is => 'ro',
 239         isa => 'Str',
 240         predicate => 'has_string',
 241 );
 242
 243 has 'object' => ( # could be anything.
 244         is => 'ro',
 245         predicate => 'has_object',
 246         clearer => 'clear_object',
 247 );
 248
 249 # In the case of a TEI document with multiple texts, specify
 250 # which text is the root. Should be an XPath expression.
 251 has 'use_text' => (
 252         is => 'ro',
 253         isa => 'Str',
 254         );
 255
 256 # Text.  This is an array of strings (i.e. word tokens).
 257 # TODO Think about how to handle this for the case of pre-prepared
 258 # collations, where the tokens are in the graph already.
 259 has 'text' => (
 260         is => 'rw',
 261         isa => 'ArrayRef[Str]',
 262         predicate => 'has_text',
 263         );
 264
 265 has 'layertext' => (
 266         is => 'rw',
 267         isa => 'ArrayRef[Str]',
 268         predicate => 'has_layertext',
 269         );
 270
 271 has 'is_collated' => (
 272         is => 'rw',
 273         isa => 'Bool'
 274         );
 275
 276 # Path.  This is an array of Reading nodes that can be saved during
 277 # initialization, but should be cleared before saving in a DB.
 278 has 'path' => (
 279         is => 'rw',
 280         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 281         predicate => 'has_path',
 282         clearer => 'clear_path',
 283         );
 284
 285 ## TODO change the name of this
 286 has 'uncorrected_path' => (
 287         is => 'rw',
 288         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 289         clearer => 'clear_uncorrected_path',
 290         );
 291
 292 has 'is_layered' => (
 293         is => 'rw',
 294         isa => 'Bool',
 295         );
 296
 297 # If we set an uncorrected path, ever, remember that we did so.
 298 around 'uncorrected_path' => sub {
 299         my $orig = shift;
 300         my $self = shift;
 301
 302         $self->is_layered( 1 );
 303         $self->$orig( @_ );
 304 };
 305
 306 sub BUILD {
 307         my $self = shift;
 308         if( $self->has_source ) {
 309                 my $init_sub = '_init_from_' . $self->sourcetype;
 310                 $self->$init_sub();
 311                 # Remove our XML / source objects; we no longer need them.
 312                 $self->clear_object if $self->has_object;
 313                 # $self->tradition->collation->make_witness_path( $self );
 314         }
 315         if( $self->sourcetype eq 'collation' ) {
 316                 $self->is_collated( 1 );
 317         }
 318         return $self;
 319 }
 320
 321 sub has_source {
 322         my $self = shift;
 323         return $self->has_file || $self->has_string || $self->has_object;
 324 }
 325
 326 sub _init_from_xmldesc {
 327         my $self = shift;
 328         my $xmlobj;
 329         if( $self->has_object ) {
 330                 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
 331                         throw( ident => "bad source",
 332                                    message => "Source object must be an XML::LibXML::Element (this is "
 333                                         . ref( $self->object ) . ");" );
 334                 }
 335                 $xmlobj = $self->object;
 336         } else {
 337                 require XML::LibXML;
 338                 my $parser = XML::LibXML->new();
 339                 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
 340                 try {
 341                         $xmlobj = $parser->$parsersub( $self->file )->documentElement;
 342                 } catch( XML::LibXML::Error $e ) {
 343                         throw( ident => "bad source",
 344                                    message => "XML parsing error: " . $e->as_string );
 345                 }
 346         }
 347
 348         unless( $xmlobj->nodeName eq 'TEI' ) {
 349                 throw( ident => "bad source",
 350                        message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
 351         }
 352
 353         # Set up the tags we need, with or without namespaces.
 354         map { $tags{$_} = "//$_" }
 355                 qw/ msDesc msName settlement repository idno p lg w seg add del /;
 356         # Set up our XPath object
 357         my $xpc = _xpc_for_el( $xmlobj );
 358         # Use namespace-aware tags if we have to
 359         if( $xmlobj->namespaceURI ) {
 360             map { $tags{$_} = "//tei:$_" } keys %tags;
 361         }
 362
 363         # Get the identifier
 364         if( my $desc = $xpc->find( $tags{msDesc} ) ) {
 365                 my $descnode = $desc->get_node(1);
 366                 # First try to use settlement/repository/idno.
 367                 my( $setNode, $reposNode, $idNode ) =
 368                         ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
 369                           $xpc->find( $tags{repository}, $descnode )->get_node(1),
 370                           $xpc->find( $tags{idno}, $descnode )->get_node(1) );
 371                 $self->settlement( $setNode ? $setNode->textContent : '' );
 372                 $self->repository( $reposNode ? $reposNode->textContent : '' );
 373                 $self->idno( $idNode ? $idNode->textContent : '' );
 374                 if( $self->settlement && $self->idno ) {
 375                 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
 376                 } else {
 377                     # Look for an msName.
 378                     my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
 379                     if( $msNameNode ) {
 380                 $self->identifier( $msNameNode->textContent );
 381             } else {
 382                 # We have an msDesc but who knows what is in it?
 383                 my $desc = $descnode->textContent;
 384                 $desc =~ s/\n/ /gs;
 385                 $desc =~ s/\s+/ /g;
 386                 $self->identifier( $desc );
 387             }
 388         }
 389         if( $descnode->hasAttribute('xml:id') ) {
 390                         $self->_set_sigil( $descnode->getAttribute('xml:id') );
 391                 } elsif( !$self->has_sigil ) {
 392                         throw( ident => 'missing sigil',
 393                                    message => 'Could not find xml:id witness sigil' );
 394                 }
 395         } else {
 396             throw( ident => "bad source",
 397                    message => "Could not find manuscript description element in TEI header" );
 398         }
 399
 400         # Now get the words out.
 401         my @words;
 402         my @layerwords;  # if the witness has layers
 403         # First, make sure all the words are wrapped in tags.
 404         # TODO Make this not necessarily dependent upon whitespace...
 405         word_tag_wrap( $xmlobj );
 406         # Now go text hunting.
 407         my @textnodes;
 408         if( $self->use_text ) {
 409                 @textnodes = $xpc->findnodes( $self->use_text );
 410         } else {
 411                 # Use the first 'text' node in the document.
 412                 @textnodes = $xmlobj->getElementsByTagName( 'text' );
 413         }
 414         my $teitext = $textnodes[0];
 415         if( $teitext ) {
 416                 _tokenize_text( $self, $teitext, \@words, \@layerwords );
 417         } else {
 418             throw( ident => "bad source",
 419                    message => "No text element in document '" . $self->{'identifier'} . "!" );
 420         }
 421
 422         my @text = map { $_->text } @words;
 423         my @layertext = map { $_->text } @layerwords;
 424         $self->path( \@words );
 425         $self->text( \@text );
 426         if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
 427                 $self->uncorrected_path( \@layerwords );
 428                 $self->layertext( \@layertext );
 429         }
 430 }
 431
 432 sub _tokenize_text {
 433         my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
 434         # Strip out the words.
 435         my $xpc = _xpc_for_el( $teitext );
 436         my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
 437         foreach( @divs ) {
 438                 my $place_str;
 439                 if( my $n = $_->getAttribute( 'n' ) ) {
 440                         $place_str = '#DIV_' . $n . '#';
 441                 } else {
 442                         $place_str = '#DIV#';
 443                 }
 444                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
 445         }  # foreach <div/>
 446
 447         # But maybe we don't have any divs.  Just paragraphs.
 448         unless( @divs ) {
 449                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
 450         }
 451 }
 452
 453 sub _objectify_words {
 454         my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
 455
 456         my $xpc = _xpc_for_el( $element );
 457         my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
 458         my @pgraphs = $xpc->findnodes( $xpexpr );
 459     return () unless @pgraphs;
 460     # Set up an expression to look for words and segs
 461     $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
 462         foreach my $pg ( @pgraphs ) {
 463                 # If this paragraph is the descendant of a note element,
 464                 # skip it.
 465                 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
 466                 next if scalar @noop_container;
 467                 # Get the text of each node
 468                 my $first_word = 1;
 469                 # Hunt down each wrapped word/seg, and make an object (or two objects)
 470                 # of it, if necessary.
 471                 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
 472                         my( $text, $uncorr ) = _get_word_strings( $c );
 473 #                       try {
 474 #                               ( $text, $uncorr ) = _get_word_object( $c );
 475 #                       } catch( Text::Tradition::Error $e
 476 #                                               where { $_->has_tag( 'lb' ) } ) {
 477 #                               next;
 478 #                       }
 479                         unless( defined $text || defined $uncorr ) {
 480                                 print STDERR "WARNING: no text in node " . $c->nodeName
 481                                         . "\n" unless $c->nodeName eq 'lb';
 482                                 next;
 483                         }
 484                         print STDERR "DEBUG: space found in element node "
 485                                 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
 486
 487                         my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
 488                         while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
 489                                 $ctr++;
 490                         }
 491                         my $id = $self->sigil . 'r' . $ctr;
 492                         my( $word, $acword );
 493                         if( $text ) {
 494                                 $word = $self->tradition->collation->add_reading(
 495                                         { 'id' => $id, 'text' => $text });
 496                         }
 497                         if( $uncorr && $uncorr ne $text ) {
 498                                 $id .= '_ac';
 499                                 $acword = $self->tradition->collation->add_reading(
 500                                         { 'id' => $id, 'text' => $uncorr });
 501                         } elsif( $uncorr ) {
 502                                 $acword = $word;
 503                         }
 504
 505 #                       if( $first_word ) {
 506 #                               $first_word = 0;
 507 #                               # Set the relevant sectioning markers
 508 #                               if( $divmarker ) {
 509 #                                       $w->add_placeholder( $divmarker );
 510 #                                       $divmarker = undef;
 511 #                               }
 512 #                               $w->add_placeholder( '#PG#' );
 513 #                       }
 514                         push( @$wordlist, $word ) if $word;
 515                         push( @$uncorrlist, $acword ) if $acword;
 516                 }
 517     }
 518 }
 519
 520 # Given a word or segment node, make a Reading object for the word
 521 # therein. Make two Reading objects if there is an 'uncorrected' vs.
 522 # 'corrected' state.
 523
 524 sub _get_word_strings {
 525         my( $node ) = @_;
 526         my( $text, $uncorrtext );
 527         # We can have an lb or pb in the middle of a word; if we do, the
 528         # whitespace (including \n) after the break becomes insignificant
 529         # and we want to nuke it.
 530         my $strip_leading_space = 0;
 531         my $word_excluded = 0;
 532         my $xpc = _xpc_for_el( $node );
 533         # TODO This does not cope with nested add/dels.
 534         my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
 535         my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
 536         foreach my $c ($node->childNodes() ) {
 537                 if( $c->nodeName eq 'num'
 538                         && defined $c->getAttribute( 'value' ) ) {
 539                         # Push the number.
 540                         $text .= $c->getAttribute( 'value' ) unless @deletion;
 541                         $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
 542                         # If this is just after a line/page break, return to normal behavior.
 543                         $strip_leading_space = 0;
 544                 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
 545                         # Set a flag that strips leading whitespace until we
 546                         # get to the next bit of non-whitespace.
 547                         $strip_leading_space = 1;
 548                 } elsif ( $c->nodeName eq 'fw'   # for catchwords
 549                                   || $c->nodeName eq 'sic'
 550                                   || $c->nodeName eq 'note'      #TODO: decide how to deal with notes
 551                                   || $c->textContent eq ''
 552                                   || ref( $c ) eq 'XML::LibXML::Comment' ) {
 553                         $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
 554                         next;
 555                 } elsif( $c->nodeName eq 'add' ) {
 556                         my( $use, $discard ) = _get_word_strings( $c );
 557                         $text .= $use;
 558                 } elsif( $c->nodeName eq 'del' ) {
 559                         my( $discard, $use ) = _get_word_strings( $c );
 560                         $uncorrtext .= $use;
 561                 } else {
 562                         my ( $tagtxt, $taguncorr );
 563                         if( ref( $c ) eq 'XML::LibXML::Text' ) {
 564                                 # A text node.
 565                                 $tagtxt = $c->textContent;
 566                                 $taguncorr = $c->textContent;
 567                         } else {
 568                                 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
 569                         }
 570                         if( $strip_leading_space ) {
 571                                 $tagtxt =~ s/^[\s\n]+//s;
 572                                 $taguncorr =~ s/^[\s\n]+//s;
 573                                 # Unset the flag as soon as we see non-whitespace.
 574                                 $strip_leading_space = 0 if $tagtxt;
 575                         }
 576                         $text .= $tagtxt;
 577                         $uncorrtext .= $taguncorr;
 578                 }
 579         }
 580         throw( ident => "text not found",
 581                tags => [ $node->nodeName ],
 582                message => "No text found in node " . $node->toString(0) )
 583             unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
 584         return( $text, $uncorrtext );
 585 }
 586
 587 sub _split_words {
 588         my( $self, $string, $c ) = @_;
 589         my @raw_words = split( /\s+/, $string );
 590         my @words;
 591         foreach my $w ( @raw_words ) {
 592                 my $id = $self->sigil . 'r'. $c++;
 593                 my %opts = ( 'text' => $w, 'id' => $id );
 594                 my $w_obj = $self->tradition->collation->add_reading( \%opts );
 595                 # Skip any words that have been canonized out of existence.
 596                 next if( length( $w_obj->text ) == 0 );
 597                 push( @words, $w_obj );
 598         }
 599         return @words;
 600 }
 601
 602 sub _init_from_json {
 603         my( $self ) = shift;
 604         my $wit;
 605         if( $self->has_object ) {
 606                 $wit = $self->object;
 607         } elsif( $self->has_string ) {
 608                 $wit = from_json( $self->string );
 609         } elsif( $self->has_file ) {
 610         my $ok = open( INPUT, $self->file );
 611         unless( $ok ) {
 612                         throw( ident => "bad source",
 613                                    message => 'Could not open ' . $self->file . ' for reading' );
 614         }
 615         binmode( INPUT, ':encoding(UTF-8)' );
 616         my @lines = <INPUT>;
 617         close INPUT;
 618         $wit = from_json( join( '', @lines ) );
 619         }
 620
 621         if( exists $wit->{'id'} ) {
 622                 $self->_set_sigil( $wit->{'id'} );
 623         } elsif( !$self->has_sigil ) {
 624                 throw( ident => 'missing sigil',
 625                            message => 'Could not find witness sigil (id) in JSON spec' );
 626         }
 627         $self->identifier( $wit->{'name'} );
 628         my @words;
 629         my @layerwords;
 630         my( @text, @layertext );
 631         if( exists $wit->{'content'} ) {
 632                 # We need to tokenize the text ourselves.
 633                 @words = _split_words( $self, $wit->{'content'} );
 634         } elsif( exists $wit->{'tokens'} ) {
 635                 # We have a bunch of pretokenized words.
 636                 my $ctr = 0;
 637                 foreach my $token ( @{$wit->{'tokens'}} ) {
 638                         my $w_obj = $self->tradition->collation->add_reading({
 639                                 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 640                         push( @words, $w_obj );
 641                         push( @text, $token->{'t'} ); # TODO unless...?
 642                 }
 643                 ## TODO rethink this JSOn mechanism
 644                 if( exists $wit->{'layertokens'} ) {
 645                         foreach my $token ( @{$wit->{'layertokens'}} ) {
 646                                 my $w_obj = $self->tradition->collation->add_reading({
 647                                         'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 648                                 push( @layerwords, $w_obj );
 649                                 push( @layertext, $token->{'t'} );
 650                         }
 651                 }
 652         }
 653         $self->text( \@text );
 654         $self->layertext( \@layertext ) if @layertext;
 655         $self->path( \@words );
 656         $self->uncorrected_path( \@layerwords ) if @layerwords;
 657 }
 658
 659 sub _init_from_plaintext {
 660     my( $self ) = @_;
 661     unless( $self->has_sigil ) {
 662         throw( "No sigil defined for the plaintext witness" );
 663     }
 664     my $str;
 665     if( $self->has_file ) {
 666         my $ok = open( INPUT, $self->file );
 667         unless( $ok ) {
 668                         throw( ident => "bad source",
 669                                    message => 'Could not open ' . $self->file . ' for reading' );
 670         }
 671         binmode( INPUT, ':encoding(UTF-8)' );
 672         my @lines = <INPUT>;
 673         close INPUT;
 674         $str = join( '', @lines );
 675     } elsif( $self->has_object ) { # ...seriously?
 676         $str = ${$self->object};
 677     } else {
 678         $str = $self->string;
 679     }
 680
 681     # TODO allow a different word separation expression
 682     my @text = split( /\s+/, $str );
 683     $self->text( \@text );
 684     my @words = _split_words( $self, $str );
 685         $self->path( \@words );
 686 }
 687
 688 sub throw {
 689         Text::Tradition::Error->throw(
 690                 'ident' => 'Witness parsing error',
 691                 'message' => $_[0],
 692                 );
 693 }
 694
 695 sub _xpc_for_el {
 696         my $el = shift;
 697         my $xpc = XML::LibXML::XPathContext->new( $el );
 698                 if( $el->namespaceURI ) {
 699                         $xpc->registerNs( 'tei', $el->namespaceURI );
 700                 }
 701         return $xpc;
 702 }
 703
 704 =head2 export_as_json
 705
 706 Exports the witness as a JSON structure, with the following keys:
 707
 708 =over 4
 709
 710 =item * id - The witness sigil
 711
 712 =item * name - The witness identifier
 713
 714 =item * tokens - An array of hashes of the form { "t":"WORD" }
 715
 716 =back
 717
 718 =begin testing
 719
 720 use Text::Tradition;
 721 my $trad = Text::Tradition->new();
 722
 723 my @text = qw/ Thhis is a line of text /;
 724 my $wit = $trad->add_witness(
 725     'sigil' => 'A',
 726     'string' => join( ' ', @text ),
 727     'sourcetype' => 'plaintext',
 728     'identifier' => 'test witness',
 729      );
 730 my $jsonstruct = $wit->export_as_json;
 731 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 732 is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
 733 is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
 734 foreach my $idx ( 0 .. $#text ) {
 735         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
 736 }
 737
 738 my @ctext = qw( when april with his showers sweet with fruit the drought of march
 739                                 has pierced unto the root );
 740 $trad = Text::Tradition->new(
 741         'input' => 'CollateX',
 742         'file' => 't/data/Collatex-16.xml' );
 743
 744 $jsonstruct = $trad->witness('A')->export_as_json;
 745 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 746 is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
 747 is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
 748 foreach my $idx ( 0 .. $#ctext ) {
 749         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
 750 }
 751
 752 ## TODO test layertext export
 753
 754 =end testing
 755
 756 =cut
 757
 758 sub export_as_json {
 759         my $self = shift;
 760         my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
 761         my $obj =  {
 762                 'id' => $self->sigil,
 763                 'tokens' => \@wordlist,
 764                 'name' => $self->identifier,
 765         };
 766         if( $self->is_layered ) {
 767                 my @lwlist = map { { 't' => $_ || '' } } @{$self->layertext};
 768                 $obj->{'layertokens'} = \@lwlist;
 769         }
 770         return $obj;
 771 }
 772
 773 no Moose;
 774 __PACKAGE__->meta->make_immutable;
 775
 776 =head1 BUGS / TODO
 777
 778 =over
 779
 780 =item * Figure out how to serialize a witness
 781
 782 =item * Support encodings other than UTF-8
 783
 784 =back
 785
 786 =head1 LICENSE
 787
 788 This package is free software and is provided "as is" without express
 789 or implied warranty.  You can redistribute it and/or modify it under
 790 the same terms as Perl itself.
 791
 792 =head1 AUTHOR
 793
 794 Tara L Andrews E<lt>aurum@cpan.orgE<gt>