base/lib/Text/Tradition/Witness.pm

   1 package Text::Tradition::Witness;
   2
   3 use vars qw( %tags );
   4 use JSON;
   5 use Moose;
   6 use Moose::Util::TypeConstraints;
   7 use Text::TEI::Markup qw( word_tag_wrap );
   8 use TryCatch;
   9 use XML::Easy::Syntax qw( $xml10_name_rx );
  10
  11 =head1 NAME
  12
  13 Text::Tradition::Witness - a manuscript witness to a text tradition
  14
  15 =head1 SYNOPSIS
  16
  17   use Text::Tradition::Witness;
  18   my $w = Text::Tradition::Witness->new(
  19     'sigil' => 'A',
  20     'identifier' => 'Oxford MS Ex.1932',
  21     );
  22
  23 =head1 DESCRIPTION
  24
  25 Text::Tradition::Witness is an object representation of a manuscript
  26 witness to a text tradition.  A manuscript has a sigil (a short code that
  27 represents it in the wider tradition), an identifier (e.g. the library ID),
  28 and probably a text.
  29
  30 =head1 METHODS
  31
  32 =head2 new
  33
  34 Create a new witness.  Options include:
  35
  36 =over
  37
  38 =item * sigil - A short code to represent the manuscript.  Required.
  39
  40 =item * sourcetype - What sort of witness data this is. Options are
  41 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
  42 used by Collation parsers.)
  43
  44 =item * file
  45 =item * string
  46 =item * object
  47
  48 The data source for the witness.  Use the appropriate option.
  49
  50 =item * use_text - An initialization option.  If the witness is read from a
  51 TEI document and more than one <text/> tag exists therein, the default
  52 behavior is to use the first defined text.  If this is not desired,
  53 use_text should be set to an XPath expression that will select the correct
  54 text.
  55
  56 =item * identifier - The recognized name of the manuscript, e.g. a library
  57 identifier. Taken from the msDesc element for a TEI file.
  58
  59 =item * other_info - A freeform string for any other description of the
  60 manuscript.
  61
  62 =back
  63
  64 =head2 sigil
  65
  66 The sigil by which to identify this manuscript, which must conform to the
  67 specification for XML attribute strings (broadly speaking, it must begin
  68 with a letter and can have only a few sorts of punctuation characters in
  69 it.)
  70
  71 =head2 identifier
  72
  73 A freeform name by which to identify the manuscript, which may be longer
  74 than the sigil.  Defaults to 'Unidentified ms', but will be taken from the
  75 TEI msName attribute, or constructed from the settlement and idno if
  76 supplied.
  77
  78 =head2 settlement
  79
  80 The city, town, etc. where the manuscript is held. Will be read from the
  81 TEI msDesc element if supplied.
  82
  83 =head2 repository
  84
  85 The institution that holds the manuscript. Will be read from the TEI msDesc
  86 element if supplied.
  87
  88 =head2 idno
  89
  90 The identification or call number of the manuscript.  Will be read from the
  91 TEI msDesc element if supplied.
  92
  93 =head2 text
  94
  95 An array of strings (words) that contains the text of the
  96 manuscript.  This should not change after the witness has been
  97 instantiated, and the path through the collation should always match it.
  98
  99 =head2 layertext
 100
 101 An array of strings (words) that contains the layered
 102 text, if any, of the manuscript.  This should not change after the witness
 103 has been instantiated, and the path through the collation should always
 104 match it.
 105
 106 =head2 identifier
 107
 108 Accessor method for the witness identifier.
 109
 110 =head2 other_info
 111
 112 Accessor method for the general witness description.
 113
 114 =head2 has_source
 115
 116 Boolean method that returns a true value if the witness was created with a
 117 data source (that is, a file, string, or object to be parsed).
 118
 119 =head2 is_layered
 120
 121 Boolean method to note whether the witness has layers (e.g. pre-correction
 122 readings) in the collation.
 123
 124 =begin testing
 125
 126 use Text::Tradition;
 127 my $trad = Text::Tradition->new( 'name' => 'test tradition' );
 128 my $c = $trad->collation;
 129
 130 # Test a plaintext witness via string
 131 my $str = 'This is a line of text';
 132 my $ptwit = $trad->add_witness(
 133     'sigil' => 'A',
 134     'sourcetype' => 'plaintext',
 135     'string' => $str
 136      );
 137 is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
 138 if( $ptwit ) {
 139     is( $ptwit->sigil, 'A', "Witness has correct sigil" );
 140     is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
 141 }
 142
 143 # Test some JSON witnesses via object
 144 open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
 145 binmode( JSIN, ':encoding(UTF-8)' );
 146 my @lines = <JSIN>;
 147 close JSIN;
 148 $trad->add_json_witnesses( join( '', @lines ) );
 149 is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
 150         "Found first JSON witness" );
 151 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
 152         "Found second JSON witness" );
 153
 154 # Test an XML witness via file
 155 my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
 156         'file' => 't/data/witnesses/teiwit.xml' );
 157 is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
 158 if( $xmlwit ) {
 159         is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
 160         ok( $xmlwit->is_layered, "Picked up correction layer" );
 161         is( @{$xmlwit->text}, 182, "Got correct text length" );
 162         is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
 163 }
 164 my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
 165 is( @allwitwords, 184, "Reused appropriate readings" );
 166
 167 ## Test use_text
 168 my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
 169         'file' => 't/data/witnesses/group.xml',
 170         'use_text' => '//tei:group/tei:text[2]' );
 171 is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
 172 if( $xpwit ) {
 173         is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
 174         ok( !$xpwit->is_layered, "Picked up no correction layer" );
 175         is( @{$xpwit->text}, 157, "Got correct text length" );
 176 }
 177
 178
 179 =end testing
 180
 181 =cut
 182
 183 subtype 'SourceType',
 184         as 'Str',
 185         where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
 186         message { 'Source type must be one of xmldesc, plaintext, json, collation' };
 187
 188 subtype 'Sigil',
 189         as 'Str',
 190         where { $_ =~ /\A$xml10_name_rx\z/ },
 191         message { 'Sigil must be a valid XML attribute string' };
 192
 193 no Moose::Util::TypeConstraints;
 194
 195 has 'tradition' => (
 196         'is' => 'ro',
 197         'isa' => 'Text::Tradition',
 198         'required' => 1,
 199         weak_ref => 1
 200         );
 201
 202 # Sigil. Required identifier for a witness, but may be found inside
 203 # the XML file.
 204 has 'sigil' => (
 205         is => 'ro',
 206         isa => 'Sigil',
 207         predicate => 'has_sigil',
 208         writer => '_set_sigil',
 209         );
 210
 211 # Other identifying information
 212 has 'identifier' => (
 213         is => 'rw',
 214         isa => 'Str',
 215         );
 216
 217 has 'settlement' => (
 218         is => 'rw',
 219         isa => 'Str',
 220         );
 221
 222 has 'repository' => (
 223         is => 'rw',
 224         isa => 'Str',
 225         );
 226
 227 has 'idno' => (
 228         is => 'rw',
 229         isa => 'Str',
 230         );
 231
 232 # Source. Can be XML obj, JSON data struct, or string.
 233 # Not used if the witness is created by parsing a collation.
 234 has 'sourcetype' => (
 235         is => 'ro',
 236         isa => 'SourceType',
 237         required => 1,
 238 );
 239
 240 has 'file' => (
 241         is => 'ro',
 242         isa => 'Str',
 243         predicate => 'has_file',
 244 );
 245
 246 has 'string' => (
 247         is => 'ro',
 248         isa => 'Str',
 249         predicate => 'has_string',
 250 );
 251
 252 has 'object' => ( # could be anything.
 253         is => 'ro',
 254         predicate => 'has_object',
 255         clearer => 'clear_object',
 256 );
 257
 258 # In the case of a TEI document with multiple texts, specify
 259 # which text is the root. Should be an XPath expression.
 260 has 'use_text' => (
 261         is => 'ro',
 262         isa => 'Str',
 263         );
 264
 265 # Text.  This is an array of strings (i.e. word tokens).
 266 # TODO Think about how to handle this for the case of pre-prepared
 267 # collations, where the tokens are in the graph already.
 268 has 'text' => (
 269         is => 'rw',
 270         isa => 'ArrayRef[Str]',
 271         predicate => 'has_text',
 272         );
 273
 274 has 'layertext' => (
 275         is => 'rw',
 276         isa => 'ArrayRef[Str]',
 277         predicate => 'has_layertext',
 278         );
 279
 280 # Path.  This is an array of Reading nodes that can be saved during
 281 # initialization, but should be cleared before saving in a DB.
 282 has 'path' => (
 283         is => 'rw',
 284         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 285         predicate => 'has_path',
 286         clearer => 'clear_path',
 287         );
 288
 289 ## TODO change the name of this
 290 has 'uncorrected_path' => (
 291         is => 'rw',
 292         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 293         clearer => 'clear_uncorrected_path',
 294         );
 295
 296 has 'is_layered' => (
 297         is => 'rw',
 298         isa => 'Bool',
 299         );
 300
 301 # If we set an uncorrected path, ever, remember that we did so.
 302 around 'uncorrected_path' => sub {
 303         my $orig = shift;
 304         my $self = shift;
 305
 306         $self->is_layered( 1 );
 307         $self->$orig( @_ );
 308 };
 309
 310 sub BUILD {
 311         my $self = shift;
 312         if( $self->has_source ) {
 313                 my $init_sub = '_init_from_' . $self->sourcetype;
 314                 $self->$init_sub();
 315                 # Remove our XML / source objects; we no longer need them.
 316                 $self->clear_object if $self->has_object;
 317                 $self->tradition->collation->make_witness_path( $self );
 318         }
 319         return $self;
 320 }
 321
 322 sub has_source {
 323         my $self = shift;
 324         return $self->has_file || $self->has_string || $self->has_object;
 325 }
 326
 327 sub _init_from_xmldesc {
 328         my $self = shift;
 329         my $xmlobj;
 330         if( $self->has_object ) {
 331                 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
 332                         throw( ident => "bad source",
 333                                    message => "Source object must be an XML::LibXML::Element (this is "
 334                                         . ref( $self->object ) . ");" );
 335                 }
 336                 $xmlobj = $self->object;
 337         } else {
 338                 require XML::LibXML;
 339                 my $parser = XML::LibXML->new();
 340                 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
 341                 try {
 342                         $xmlobj = $parser->$parsersub( $self->file )->documentElement;
 343                 } catch( XML::LibXML::Error $e ) {
 344                         throw( ident => "bad source",
 345                                    message => "XML parsing error: " . $e->as_string );
 346                 }
 347         }
 348
 349         unless( $xmlobj->nodeName eq 'TEI' ) {
 350                 throw( ident => "bad source",
 351                        message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
 352         }
 353
 354         # Set up the tags we need, with or without namespaces.
 355         map { $tags{$_} = "//$_" }
 356                 qw/ msDesc msName settlement repository idno p lg w seg add del /;
 357         # Set up our XPath object
 358         my $xpc = _xpc_for_el( $xmlobj );
 359         # Use namespace-aware tags if we have to
 360         if( $xmlobj->namespaceURI ) {
 361             map { $tags{$_} = "//tei:$_" } keys %tags;
 362         }
 363
 364         # Get the identifier
 365         if( my $desc = $xpc->find( $tags{msDesc} ) ) {
 366                 my $descnode = $desc->get_node(1);
 367                 # First try to use settlement/repository/idno.
 368                 my( $setNode, $reposNode, $idNode ) =
 369                         ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
 370                           $xpc->find( $tags{repository}, $descnode )->get_node(1),
 371                           $xpc->find( $tags{idno}, $descnode )->get_node(1) );
 372                 $self->settlement( $setNode ? $setNode->textContent : '' );
 373                 $self->repository( $reposNode ? $reposNode->textContent : '' );
 374                 $self->idno( $idNode ? $idNode->textContent : '' );
 375                 if( $self->settlement && $self->idno ) {
 376                 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
 377                 } else {
 378                     # Look for an msName.
 379                     my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
 380                     if( $msNameNode ) {
 381                 $self->identifier( $msNameNode->textContent );
 382             } else {
 383                 # We have an msDesc but who knows what is in it?
 384                 my $desc = $descnode->textContent;
 385                 $desc =~ s/\n/ /gs;
 386                 $desc =~ s/\s+/ /g;
 387                 $self->identifier( $desc );
 388             }
 389         }
 390         if( $descnode->hasAttribute('xml:id') ) {
 391                         $self->_set_sigil( $descnode->getAttribute('xml:id') );
 392                 } elsif( !$self->has_sigil ) {
 393                         throw( ident => 'missing sigil',
 394                                    message => 'Could not find xml:id witness sigil' );
 395                 }
 396         } else {
 397             throw( ident => "bad source",
 398                    message => "Could not find manuscript description element in TEI header" );
 399         }
 400
 401         # Now get the words out.
 402         my @words;
 403         my @layerwords;  # if the witness has layers
 404         # First, make sure all the words are wrapped in tags.
 405         # TODO Make this not necessarily dependent upon whitespace...
 406         word_tag_wrap( $xmlobj );
 407         # Now go text hunting.
 408         my @textnodes;
 409         if( $self->use_text ) {
 410                 @textnodes = $xpc->findnodes( $self->use_text );
 411         } else {
 412                 # Use the first 'text' node in the document.
 413                 @textnodes = $xmlobj->getElementsByTagName( 'text' );
 414         }
 415         my $teitext = $textnodes[0];
 416         if( $teitext ) {
 417                 _tokenize_text( $self, $teitext, \@words, \@layerwords );
 418         } else {
 419             throw( ident => "bad source",
 420                    message => "No text element in document '" . $self->{'identifier'} . "!" );
 421         }
 422
 423         my @text = map { $_->text } @words;
 424         my @layertext = map { $_->text } @layerwords;
 425         $self->path( \@words );
 426         $self->text( \@text );
 427         if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
 428                 $self->uncorrected_path( \@layerwords );
 429                 $self->layertext( \@layertext );
 430         }
 431 }
 432
 433 sub _tokenize_text {
 434         my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
 435         # Strip out the words.
 436         my $xpc = _xpc_for_el( $teitext );
 437         my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
 438         foreach( @divs ) {
 439                 my $place_str;
 440                 if( my $n = $_->getAttribute( 'n' ) ) {
 441                         $place_str = '#DIV_' . $n . '#';
 442                 } else {
 443                         $place_str = '#DIV#';
 444                 }
 445                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
 446         }  # foreach <div/>
 447
 448         # But maybe we don't have any divs.  Just paragraphs.
 449         unless( @divs ) {
 450                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
 451         }
 452 }
 453
 454 sub _objectify_words {
 455         my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
 456
 457         my $xpc = _xpc_for_el( $element );
 458         my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
 459         my @pgraphs = $xpc->findnodes( $xpexpr );
 460     return () unless @pgraphs;
 461     # Set up an expression to look for words and segs
 462     $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
 463         foreach my $pg ( @pgraphs ) {
 464                 # If this paragraph is the descendant of a note element,
 465                 # skip it.
 466                 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
 467                 next if scalar @noop_container;
 468                 # Get the text of each node
 469                 my $first_word = 1;
 470                 # Hunt down each wrapped word/seg, and make an object (or two objects)
 471                 # of it, if necessary.
 472                 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
 473                         my( $text, $uncorr ) = _get_word_strings( $c );
 474 #                       try {
 475 #                               ( $text, $uncorr ) = _get_word_object( $c );
 476 #                       } catch( Text::Tradition::Error $e
 477 #                                               where { $_->has_tag( 'lb' ) } ) {
 478 #                               next;
 479 #                       }
 480                         unless( defined $text || defined $uncorr ) {
 481                                 print STDERR "WARNING: no text in node " . $c->nodeName
 482                                         . "\n" unless $c->nodeName eq 'lb';
 483                                 next;
 484                         }
 485                         print STDERR "DEBUG: space found in element node "
 486                                 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
 487
 488                         my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
 489                         while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
 490                                 $ctr++;
 491                         }
 492                         my $id = $self->sigil . 'r' . $ctr;
 493                         my( $word, $acword );
 494                         if( $text ) {
 495                                 $word = $self->tradition->collation->add_reading(
 496                                         { 'id' => $id, 'text' => $text });
 497                         }
 498                         if( $uncorr && $uncorr ne $text ) {
 499                                 $id .= '_ac';
 500                                 $acword = $self->tradition->collation->add_reading(
 501                                         { 'id' => $id, 'text' => $uncorr });
 502                         } elsif( $uncorr ) {
 503                                 $acword = $word;
 504                         }
 505
 506 #                       if( $first_word ) {
 507 #                               $first_word = 0;
 508 #                               # Set the relevant sectioning markers
 509 #                               if( $divmarker ) {
 510 #                                       $w->add_placeholder( $divmarker );
 511 #                                       $divmarker = undef;
 512 #                               }
 513 #                               $w->add_placeholder( '#PG#' );
 514 #                       }
 515                         push( @$wordlist, $word ) if $word;
 516                         push( @$uncorrlist, $acword ) if $acword;
 517                 }
 518     }
 519 }
 520
 521 # Given a word or segment node, make a Reading object for the word
 522 # therein. Make two Reading objects if there is an 'uncorrected' vs.
 523 # 'corrected' state.
 524
 525 sub _get_word_strings {
 526         my( $node ) = @_;
 527         my( $text, $uncorrtext );
 528         # We can have an lb or pb in the middle of a word; if we do, the
 529         # whitespace (including \n) after the break becomes insignificant
 530         # and we want to nuke it.
 531         my $strip_leading_space = 0;
 532         my $word_excluded = 0;
 533         my $xpc = _xpc_for_el( $node );
 534         # TODO This does not cope with nested add/dels.
 535         my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
 536         my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
 537         foreach my $c ($node->childNodes() ) {
 538                 if( $c->nodeName eq 'num'
 539                         && defined $c->getAttribute( 'value' ) ) {
 540                         # Push the number.
 541                         $text .= $c->getAttribute( 'value' ) unless @deletion;
 542                         $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
 543                         # If this is just after a line/page break, return to normal behavior.
 544                         $strip_leading_space = 0;
 545                 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
 546                         # Set a flag that strips leading whitespace until we
 547                         # get to the next bit of non-whitespace.
 548                         $strip_leading_space = 1;
 549                 } elsif ( $c->nodeName eq 'fw'   # for catchwords
 550                                   || $c->nodeName eq 'sic'
 551                                   || $c->nodeName eq 'note'      #TODO: decide how to deal with notes
 552                                   || $c->textContent eq ''
 553                                   || ref( $c ) eq 'XML::LibXML::Comment' ) {
 554                         $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
 555                         next;
 556                 } elsif( $c->nodeName eq 'add' ) {
 557                         my( $use, $discard ) = _get_word_strings( $c );
 558                         $text .= $use;
 559                 } elsif( $c->nodeName eq 'del' ) {
 560                         my( $discard, $use ) = _get_word_strings( $c );
 561                         $uncorrtext .= $use;
 562                 } else {
 563                         my ( $tagtxt, $taguncorr );
 564                         if( ref( $c ) eq 'XML::LibXML::Text' ) {
 565                                 # A text node.
 566                                 $tagtxt = $c->textContent;
 567                                 $taguncorr = $c->textContent;
 568                         } else {
 569                                 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
 570                         }
 571                         if( $strip_leading_space ) {
 572                                 $tagtxt =~ s/^[\s\n]+//s;
 573                                 $taguncorr =~ s/^[\s\n]+//s;
 574                                 # Unset the flag as soon as we see non-whitespace.
 575                                 $strip_leading_space = 0 if $tagtxt;
 576                         }
 577                         $text .= $tagtxt;
 578                         $uncorrtext .= $taguncorr;
 579                 }
 580         }
 581         throw( ident => "text not found",
 582                tags => [ $node->nodeName ],
 583                message => "No text found in node " . $node->toString(0) )
 584             unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
 585         return( $text, $uncorrtext );
 586 }
 587
 588 sub _split_words {
 589         my( $self, $string, $c ) = @_;
 590         my @raw_words = split( /\s+/, $string );
 591         my @words;
 592         foreach my $w ( @raw_words ) {
 593                 my $id = $self->sigil . 'r'. $c++;
 594                 my %opts = ( 'text' => $w, 'id' => $id );
 595                 my $w_obj = $self->tradition->collation->add_reading( \%opts );
 596                 # Skip any words that have been canonized out of existence.
 597                 next if( length( $w_obj->text ) == 0 );
 598                 push( @words, $w_obj );
 599         }
 600         return @words;
 601 }
 602
 603 sub _init_from_json {
 604         my( $self ) = shift;
 605         my $wit;
 606         if( $self->has_object ) {
 607                 $wit = $self->object;
 608         } elsif( $self->has_string ) {
 609                 $wit = from_json( $self->string );
 610         } elsif( $self->has_file ) {
 611         my $ok = open( INPUT, $self->file );
 612         unless( $ok ) {
 613                         throw( ident => "bad source",
 614                                    message => 'Could not open ' . $self->file . ' for reading' );
 615         }
 616         binmode( INPUT, ':encoding(UTF-8)' );
 617         my @lines = <INPUT>;
 618         close INPUT;
 619         $wit = from_json( join( '', @lines ) );
 620         }
 621
 622         if( exists $wit->{'id'} ) {
 623                 $self->_set_sigil( $wit->{'id'} );
 624         } elsif( !$self->has_sigil ) {
 625                 throw( ident => 'missing sigil',
 626                            message => 'Could not find witness sigil (id) in JSON spec' );
 627         }
 628         $self->identifier( $wit->{'name'} );
 629         my @words;
 630         my @layerwords;
 631         my( @text, @layertext );
 632         if( exists $wit->{'content'} ) {
 633                 # We need to tokenize the text ourselves.
 634                 @words = _split_words( $self, $wit->{'content'} );
 635         } elsif( exists $wit->{'tokens'} ) {
 636                 # We have a bunch of pretokenized words.
 637                 my $ctr = 0;
 638                 foreach my $token ( @{$wit->{'tokens'}} ) {
 639                         my $w_obj = $self->tradition->collation->add_reading({
 640                                 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 641                         push( @words, $w_obj );
 642                         push( @text, $token->{'t'} ); # TODO unless...?
 643                 }
 644                 ## TODO rethink this JSOn mechanism
 645                 if( exists $wit->{'layertokens'} ) {
 646                         foreach my $token ( @{$wit->{'layertokens'}} ) {
 647                                 my $w_obj = $self->tradition->collation->add_reading({
 648                                         'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 649                                 push( @layerwords, $w_obj );
 650                                 push( @layertext, $token->{'t'} );
 651                         }
 652                 }
 653         }
 654         $self->text( \@text );
 655         $self->layertext( \@layertext ) if @layertext;
 656         $self->path( \@words );
 657         $self->uncorrected_path( \@layerwords ) if @layerwords;
 658 }
 659
 660 sub _init_from_plaintext {
 661     my( $self ) = @_;
 662     my $str;
 663     if( $self->has_file ) {
 664         my $ok = open( INPUT, $self->file );
 665         unless( $ok ) {
 666                         throw( ident => "bad source",
 667                                    message => 'Could not open ' . $self->file . ' for reading' );
 668         }
 669         binmode( INPUT, ':encoding(UTF-8)' );
 670         my @lines = <INPUT>;
 671         close INPUT;
 672         $str = join( '', @lines );
 673     } elsif( $self->has_object ) { # ...seriously?
 674         $str = ${$self->object};
 675     } else {
 676         $str = $self->string;
 677     }
 678
 679     # TODO allow a different word separation expression
 680     my @text = split( /\s+/, $str );
 681     $self->text( \@text );
 682     my @words = _split_words( $self, $str );
 683         $self->path( \@words );
 684 }
 685
 686 sub throw {
 687         Text::Tradition::Error->throw(
 688                 'ident' => 'Witness parsing error',
 689                 'message' => $_[0],
 690                 );
 691 }
 692
 693 sub _xpc_for_el {
 694         my $el = shift;
 695         my $xpc = XML::LibXML::XPathContext->new( $el );
 696                 if( $el->namespaceURI ) {
 697                         $xpc->registerNs( 'tei', $el->namespaceURI );
 698                 }
 699         return $xpc;
 700 }
 701
 702 =head2 export_as_json
 703
 704 Exports the witness as a JSON structure, with the following keys:
 705
 706 =over 4
 707
 708 =item * id - The witness sigil
 709
 710 =item * name - The witness identifier
 711
 712 =item * tokens - An array of hashes of the form { "t":"WORD" }
 713
 714 =back
 715
 716 =begin testing
 717
 718 use Text::Tradition;
 719 my $trad = Text::Tradition->new();
 720
 721 my @text = qw/ Thhis is a line of text /;
 722 my $wit = $trad->add_witness(
 723     'sigil' => 'A',
 724     'string' => join( ' ', @text ),
 725     'sourcetype' => 'plaintext',
 726     'identifier' => 'test witness',
 727      );
 728 my $jsonstruct = $wit->export_as_json;
 729 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 730 is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
 731 is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
 732 foreach my $idx ( 0 .. $#text ) {
 733         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
 734 }
 735
 736 my @ctext = qw( when april with his showers sweet with fruit the drought of march
 737                                 has pierced unto the root );
 738 $trad = Text::Tradition->new(
 739         'input' => 'CollateX',
 740         'file' => 't/data/Collatex-16.xml' );
 741
 742 $jsonstruct = $trad->witness('A')->export_as_json;
 743 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 744 is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
 745 is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
 746 foreach my $idx ( 0 .. $#ctext ) {
 747         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
 748 }
 749
 750 ## TODO test layertext export
 751
 752 =end testing
 753
 754 =cut
 755
 756 sub export_as_json {
 757         my $self = shift;
 758         my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
 759         my $obj =  {
 760                 'id' => $self->sigil,
 761                 'tokens' => \@wordlist,
 762                 'name' => $self->identifier,
 763         };
 764         if( $self->is_layered ) {
 765                 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
 766                 $obj->{'layertokens'} = \@lwlist;
 767         }
 768         return $obj;
 769 }
 770
 771 no Moose;
 772 __PACKAGE__->meta->make_immutable;
 773
 774 =head1 BUGS / TODO
 775
 776 =over
 777
 778 =item * Figure out how to serialize a witness
 779
 780 =item * Support encodings other than UTF-8
 781
 782 =back
 783
 784 =head1 LICENSE
 785
 786 This package is free software and is provided "as is" without express
 787 or implied warranty.  You can redistribute it and/or modify it under
 788 the same terms as Perl itself.
 789
 790 =head1 AUTHOR
 791
 792 Tara L Andrews E<lt>aurum@cpan.orgE<gt>