lib/Text/Tradition/Witness.pm

   1 package Text::Tradition::Witness;
   2
   3 use vars qw( %tags );
   4 use JSON;
   5 use Moose;
   6 use Moose::Util::TypeConstraints;
   7 use Text::TEI::Markup qw( word_tag_wrap );
   8 use TryCatch;
   9 use XML::Easy::Syntax qw( $xml10_name_rx );
  10
  11 =head1 NAME
  12
  13 Text::Tradition::Witness - a manuscript witness to a text tradition
  14
  15 =head1 SYNOPSIS
  16
  17   use Text::Tradition::Witness;
  18   my $w = Text::Tradition::Witness->new(
  19     'sigil' => 'A',
  20     'identifier' => 'Oxford MS Ex.1932',
  21     );
  22
  23 =head1 DESCRIPTION
  24
  25 Text::Tradition::Witness is an object representation of a manuscript
  26 witness to a text tradition.  A manuscript has a sigil (a short code that
  27 represents it in the wider tradition), an identifier (e.g. the library ID),
  28 and probably a text.
  29
  30 =head1 METHODS
  31
  32 =head2 new
  33
  34 Create a new witness.  Options include:
  35
  36 =over
  37
  38 =item * sigil - A short code to represent the manuscript.  Required.
  39
  40 =item * sourcetype - What sort of witness data this is. Options are
  41 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
  42 used by Collation parsers.)
  43
  44 =item * file
  45 =item * string
  46 =item * object
  47
  48 The data source for the witness.  Use the appropriate option.
  49
  50 =item * use_text - An initialization option.  If the witness is read from a
  51 TEI document and more than one <text/> tag exists therein, the default
  52 behavior is to use the first defined text.  If this is not desired,
  53 use_text should be set to an XPath expression that will select the correct
  54 text.
  55
  56 =item * language - The name of the applicable L<Text::Tradition::Lang>
  57 module for language handling. Usually inherited from the language set in
  58 the L<Text::Tradition> object, and defaults to Default.
  59
  60 =item * identifier - The recognized name of the manuscript, e.g. a library
  61 identifier. Taken from the msDesc element for a TEI file.
  62
  63 =item * other_info - A freeform string for any other description of the
  64 manuscript.
  65
  66 =back
  67
  68 =head2 sigil
  69
  70 The sigil by which to identify this manuscript, which must conform to the
  71 specification for XML attribute strings (broadly speaking, it must begin
  72 with a letter and can have only a few sorts of punctuation characters in
  73 it.)
  74
  75 =head2 identifier
  76
  77 A freeform name by which to identify the manuscript, which may be longer
  78 than the sigil.  Defaults to 'Unidentified ms', but will be taken from the
  79 TEI msName attribute, or constructed from the settlement and idno if
  80 supplied.
  81
  82 =head2 settlement
  83
  84 The city, town, etc. where the manuscript is held. Will be read from the
  85 TEI msDesc element if supplied.
  86
  87 =head2 repository
  88
  89 The institution that holds the manuscript. Will be read from the TEI msDesc
  90 element if supplied.
  91
  92 =head2 idno
  93
  94 The identification or call number of the manuscript.  Will be read from the
  95 TEI msDesc element if supplied.
  96
  97 =head2 text
  98
  99 An array of strings (words) that contains the text of the
 100 manuscript.  This should not change after the witness has been
 101 instantiated, and the path through the collation should always match it.
 102
 103 =head2 layertext
 104
 105 An array of strings (words) that contains the layered
 106 text, if any, of the manuscript.  This should not change after the witness
 107 has been instantiated, and the path through the collation should always
 108 match it.
 109
 110 =head2 language
 111
 112 Accessor method to get the witness language.
 113
 114 =head2 identifier
 115
 116 Accessor method for the witness identifier.
 117
 118 =head2 other_info
 119
 120 Accessor method for the general witness description.
 121
 122 =head2 is_layered
 123
 124 Boolean method to note whether the witness has layers (e.g. pre-correction
 125 readings) in the collation.
 126
 127 =begin testing
 128
 129 use Text::Tradition;
 130 my $trad = Text::Tradition->new( 'name' => 'test tradition' );
 131 my $c = $trad->collation;
 132
 133 # Test a plaintext witness via string
 134 my $str = 'This is a line of text';
 135 my $ptwit = $trad->add_witness(
 136     'sigil' => 'A',
 137     'sourcetype' => 'plaintext',
 138     'string' => $str
 139      );
 140 is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
 141 if( $ptwit ) {
 142     is( $ptwit->sigil, 'A', "Witness has correct sigil" );
 143     is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
 144 }
 145
 146 # Test some JSON witnesses via object
 147 open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
 148 binmode( JSIN, ':encoding(UTF-8)' );
 149 my @lines = <JSIN>;
 150 close JSIN;
 151 $trad->add_json_witnesses( join( '', @lines ) );
 152 is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
 153         "Found first JSON witness" );
 154 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
 155         "Found second JSON witness" );
 156
 157 # Test an XML witness via file
 158 my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
 159         'file' => 't/data/witnesses/teiwit.xml' );
 160 is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
 161 if( $xmlwit ) {
 162         is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
 163         ok( $xmlwit->is_layered, "Picked up correction layer" );
 164         is( @{$xmlwit->text}, 182, "Got correct text length" );
 165         is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
 166 }
 167 my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
 168 is( @allwitwords, 184, "Reused appropriate readings" );
 169
 170 ## Test use_text
 171 my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
 172         'file' => 't/data/witnesses/group.xml',
 173         'use_text' => '//tei:group/tei:text[2]' );
 174 is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
 175 if( $xpwit ) {
 176         is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
 177         ok( !$xpwit->is_layered, "Picked up no correction layer" );
 178         is( @{$xpwit->text}, 157, "Got correct text length" );
 179 }
 180
 181
 182 =end testing
 183
 184 =cut
 185
 186 subtype 'SourceType',
 187         as 'Str',
 188         where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
 189         message { 'Source type must be one of xmldesc, plaintext, json, collation' };
 190
 191 subtype 'Sigil',
 192         as 'Str',
 193         where { $_ =~ /\A$xml10_name_rx\z/ },
 194         message { 'Sigil must be a valid XML attribute string' };
 195
 196 no Moose::Util::TypeConstraints;
 197
 198 has 'tradition' => (
 199         'is' => 'ro',
 200         'isa' => 'Text::Tradition',
 201         'required' => 1,
 202         );
 203
 204 # Sigil. Required identifier for a witness, but may be found inside
 205 # the XML file.
 206 has 'sigil' => (
 207         is => 'ro',
 208         isa => 'Sigil',
 209         predicate => 'has_sigil',
 210         writer => '_set_sigil',
 211         );
 212
 213 has 'language' => (
 214     is => 'ro',
 215     isa => 'Str',
 216     default => 'Default',
 217     );
 218
 219 # Other identifying information
 220 has 'identifier' => (
 221         is => 'rw',
 222         isa => 'Str',
 223         );
 224
 225 has 'settlement' => (
 226         is => 'rw',
 227         isa => 'Str',
 228         );
 229
 230 has 'repository' => (
 231         is => 'rw',
 232         isa => 'Str',
 233         );
 234
 235 has 'idno' => (
 236         is => 'rw',
 237         isa => 'Str',
 238         );
 239
 240 # Source. Can be XML obj, JSON data struct, or string.
 241 # Not used if the witness is created by parsing a collation.
 242 has 'sourcetype' => (
 243         is => 'ro',
 244         isa => 'SourceType',
 245         required => 1,
 246 );
 247
 248 has 'file' => (
 249         is => 'ro',
 250         isa => 'Str',
 251         predicate => 'has_file',
 252 );
 253
 254 has 'string' => (
 255         is => 'ro',
 256         isa => 'Str',
 257         predicate => 'has_string',
 258 );
 259
 260 has 'object' => ( # could be anything.
 261         is => 'ro',
 262         predicate => 'has_object',
 263         clearer => 'clear_object',
 264 );
 265
 266 # In the case of a TEI document with multiple texts, specify
 267 # which text is the root. Should be an XPath expression.
 268 has 'use_text' => (
 269         is => 'ro',
 270         isa => 'Str',
 271         );
 272
 273 # Text.  This is an array of strings (i.e. word tokens).
 274 # TODO Think about how to handle this for the case of pre-prepared
 275 # collations, where the tokens are in the graph already.
 276 has 'text' => (
 277         is => 'rw',
 278         isa => 'ArrayRef[Str]',
 279         predicate => 'has_text',
 280         );
 281
 282 has 'layertext' => (
 283         is => 'rw',
 284         isa => 'ArrayRef[Str]',
 285         predicate => 'has_layertext',
 286         );
 287
 288 # Path.  This is an array of Reading nodes that can be saved during
 289 # initialization, but should be cleared before saving in a DB.
 290 has 'path' => (
 291         is => 'rw',
 292         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 293         predicate => 'has_path',
 294         clearer => 'clear_path',
 295         );
 296
 297 ## TODO change the name of this
 298 has 'uncorrected_path' => (
 299         is => 'rw',
 300         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 301         clearer => 'clear_uncorrected_path',
 302         );
 303
 304 has 'is_layered' => (
 305         is => 'rw',
 306         isa => 'Bool',
 307         );
 308
 309 # If we set an uncorrected path, ever, remember that we did so.
 310 around 'uncorrected_path' => sub {
 311         my $orig = shift;
 312         my $self = shift;
 313
 314         $self->is_layered( 1 );
 315         $self->$orig( @_ );
 316 };
 317
 318 sub BUILD {
 319         my $self = shift;
 320         if( $self->has_source ) {
 321                 my $init_sub = '_init_from_' . $self->sourcetype;
 322                 $self->$init_sub();
 323                 # Remove our XML / source objects; we no longer need them.
 324                 $self->clear_object if $self->has_object;
 325                 $self->tradition->collation->make_witness_path( $self );
 326         }
 327         return $self;
 328 }
 329
 330 sub has_source {
 331         my $self = shift;
 332         return $self->has_file || $self->has_string || $self->has_object;
 333 }
 334
 335 sub _init_from_xmldesc {
 336         my $self = shift;
 337         my $xmlobj;
 338         if( $self->has_object ) {
 339                 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
 340                         throw( ident => "bad source",
 341                                    message => "Source object must be an XML::LibXML::Element (this is "
 342                                         . ref( $self->object ) . ");" );
 343                 }
 344                 $xmlobj = $self->object;
 345         } else {
 346                 my $parser = XML::LibXML->new();
 347                 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
 348                 try {
 349                         $xmlobj = $parser->$parsersub( $self->file )->documentElement;
 350                 } catch( XML::LibXML::Error $e ) {
 351                         throw( ident => "bad source",
 352                                    message => "XML parsing error: " . $e->as_string );
 353                 }
 354         }
 355
 356         unless( $xmlobj->nodeName eq 'TEI' ) {
 357                 throw( ident => "bad source",
 358                        message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
 359         }
 360
 361         # Set up the tags we need, with or without namespaces.
 362         map { $tags{$_} = "//$_" }
 363                 qw/ msDesc msName settlement repository idno p lg w seg add del /;
 364         # Set up our XPath object
 365         my $xpc = _xpc_for_el( $xmlobj );
 366         # Use namespace-aware tags if we have to
 367         if( $xmlobj->namespaceURI ) {
 368             map { $tags{$_} = "//tei:$_" } keys %tags;
 369         }
 370
 371         # Get the identifier
 372         if( my $desc = $xpc->find( $tags{msDesc} ) ) {
 373                 my $descnode = $desc->get_node(1);
 374                 # First try to use settlement/repository/idno.
 375                 my( $setNode, $reposNode, $idNode ) =
 376                         ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
 377                           $xpc->find( $tags{repository}, $descnode )->get_node(1),
 378                           $xpc->find( $tags{idno}, $descnode )->get_node(1) );
 379                 $self->settlement( $setNode ? $setNode->textContent : '' );
 380                 $self->repository( $reposNode ? $reposNode->textContent : '' );
 381                 $self->idno( $idNode ? $idNode->textContent : '' );
 382                 if( $self->settlement && $self->idno ) {
 383                 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
 384                 } else {
 385                     # Look for an msName.
 386                     my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
 387                     if( $msNameNode ) {
 388                 $self->identifier( $msNameNode->textContent );
 389             } else {
 390                 # We have an msDesc but who knows what is in it?
 391                 my $desc = $descnode->textContent;
 392                 $desc =~ s/\n/ /gs;
 393                 $desc =~ s/\s+/ /g;
 394                 $self->identifier( $desc );
 395             }
 396         }
 397         if( $descnode->hasAttribute('xml:id') ) {
 398                         $self->_set_sigil( $descnode->getAttribute('xml:id') );
 399                 } elsif( !$self->has_sigil ) {
 400                         throw( ident => 'missing sigil',
 401                                    message => 'Could not find xml:id witness sigil' );
 402                 }
 403         } else {
 404             throw( ident => "bad source",
 405                    message => "Could not find manuscript description element in TEI header" );
 406         }
 407
 408         # Now get the words out.
 409         my @words;
 410         my @layerwords;  # if the witness has layers
 411         # First, make sure all the words are wrapped in tags.
 412         # TODO Make this not necessarily dependent upon whitespace...
 413         word_tag_wrap( $xmlobj );
 414         # Now go text hunting.
 415         my @textnodes;
 416         if( $self->use_text ) {
 417                 @textnodes = $xpc->findnodes( $self->use_text );
 418         } else {
 419                 # Use the first 'text' node in the document.
 420                 @textnodes = $xmlobj->getElementsByTagName( 'text' );
 421         }
 422         my $teitext = $textnodes[0];
 423         if( $teitext ) {
 424                 _tokenize_text( $self, $teitext, \@words, \@layerwords );
 425         } else {
 426             throw( ident => "bad source",
 427                    message => "No text element in document '" . $self->{'identifier'} . "!" );
 428         }
 429
 430         my @text = map { $_->text } @words;
 431         my @layertext = map { $_->text } @layerwords;
 432         $self->path( \@words );
 433         $self->text( \@text );
 434         if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
 435                 $self->uncorrected_path( \@layerwords );
 436                 $self->layertext( \@layertext );
 437         }
 438 }
 439
 440 sub _tokenize_text {
 441         my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
 442         # Strip out the words.
 443         my $xpc = _xpc_for_el( $teitext );
 444         my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
 445         foreach( @divs ) {
 446                 my $place_str;
 447                 if( my $n = $_->getAttribute( 'n' ) ) {
 448                         $place_str = '#DIV_' . $n . '#';
 449                 } else {
 450                         $place_str = '#DIV#';
 451                 }
 452                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
 453         }  # foreach <div/>
 454
 455         # But maybe we don't have any divs.  Just paragraphs.
 456         unless( @divs ) {
 457                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
 458         }
 459 }
 460
 461 sub _objectify_words {
 462         my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
 463
 464         my $xpc = _xpc_for_el( $element );
 465         my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
 466         my @pgraphs = $xpc->findnodes( $xpexpr );
 467     return () unless @pgraphs;
 468     # Set up an expression to look for words and segs
 469     $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
 470         foreach my $pg ( @pgraphs ) {
 471                 # If this paragraph is the descendant of a note element,
 472                 # skip it.
 473                 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
 474                 next if scalar @noop_container;
 475                 # Get the text of each node
 476                 my $first_word = 1;
 477                 # Hunt down each wrapped word/seg, and make an object (or two objects)
 478                 # of it, if necessary.
 479                 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
 480                         my( $text, $uncorr ) = _get_word_strings( $c );
 481 #                       try {
 482 #                               ( $text, $uncorr ) = _get_word_object( $c );
 483 #                       } catch( Text::Tradition::Error $e
 484 #                                               where { $_->has_tag( 'lb' ) } ) {
 485 #                               next;
 486 #                       }
 487                         unless( defined $text || defined $uncorr ) {
 488                                 print STDERR "WARNING: no text in node " . $c->nodeName
 489                                         . "\n" unless $c->nodeName eq 'lb';
 490                                 next;
 491                         }
 492                         print STDERR "DEBUG: space found in element node "
 493                                 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
 494
 495                         my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
 496                         while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
 497                                 $ctr++;
 498                         }
 499                         my $id = $self->sigil . 'r' . $ctr;
 500                         my( $word, $acword );
 501                         if( $text ) {
 502                                 $word = $self->tradition->collation->add_reading(
 503                                         { 'id' => $id, 'text' => $text });
 504                         }
 505                         if( $uncorr && $uncorr ne $text ) {
 506                                 $id .= '_ac';
 507                                 $acword = $self->tradition->collation->add_reading(
 508                                         { 'id' => $id, 'text' => $uncorr });
 509                         } elsif( $uncorr ) {
 510                                 $acword = $word;
 511                         }
 512
 513 #                       if( $first_word ) {
 514 #                               $first_word = 0;
 515 #                               # Set the relevant sectioning markers
 516 #                               if( $divmarker ) {
 517 #                                       $w->add_placeholder( $divmarker );
 518 #                                       $divmarker = undef;
 519 #                               }
 520 #                               $w->add_placeholder( '#PG#' );
 521 #                       }
 522                         push( @$wordlist, $word ) if $word;
 523                         push( @$uncorrlist, $acword ) if $acword;
 524                 }
 525     }
 526 }
 527
 528 # Given a word or segment node, make a Reading object for the word
 529 # therein. Make two Reading objects if there is an 'uncorrected' vs.
 530 # 'corrected' state.
 531
 532 sub _get_word_strings {
 533         my( $node ) = @_;
 534         my( $text, $uncorrtext );
 535         # We can have an lb or pb in the middle of a word; if we do, the
 536         # whitespace (including \n) after the break becomes insignificant
 537         # and we want to nuke it.
 538         my $strip_leading_space = 0;
 539         my $word_excluded = 0;
 540         my $xpc = _xpc_for_el( $node );
 541         # TODO This does not cope with nested add/dels.
 542         my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
 543         my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
 544         foreach my $c ($node->childNodes() ) {
 545                 if( $c->nodeName eq 'num'
 546                         && defined $c->getAttribute( 'value' ) ) {
 547                         # Push the number.
 548                         $text .= $c->getAttribute( 'value' ) unless @deletion;
 549                         $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
 550                         # If this is just after a line/page break, return to normal behavior.
 551                         $strip_leading_space = 0;
 552                 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
 553                         # Set a flag that strips leading whitespace until we
 554                         # get to the next bit of non-whitespace.
 555                         $strip_leading_space = 1;
 556                 } elsif ( $c->nodeName eq 'fw'   # for catchwords
 557                                   || $c->nodeName eq 'sic'
 558                                   || $c->nodeName eq 'note'      #TODO: decide how to deal with notes
 559                                   || $c->textContent eq ''
 560                                   || ref( $c ) eq 'XML::LibXML::Comment' ) {
 561                         $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
 562                         next;
 563                 } elsif( $c->nodeName eq 'add' ) {
 564                         my( $use, $discard ) = _get_word_strings( $c );
 565                         $text .= $use;
 566                 } elsif( $c->nodeName eq 'del' ) {
 567                         my( $discard, $use ) = _get_word_strings( $c );
 568                         $uncorrtext .= $use;
 569                 } else {
 570                         my ( $tagtxt, $taguncorr );
 571                         if( ref( $c ) eq 'XML::LibXML::Text' ) {
 572                                 # A text node.
 573                                 $tagtxt = $c->textContent;
 574                                 $taguncorr = $c->textContent;
 575                         } else {
 576                                 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
 577                         }
 578                         if( $strip_leading_space ) {
 579                                 $tagtxt =~ s/^[\s\n]+//s;
 580                                 $taguncorr =~ s/^[\s\n]+//s;
 581                                 # Unset the flag as soon as we see non-whitespace.
 582                                 $strip_leading_space = 0 if $tagtxt;
 583                         }
 584                         $text .= $tagtxt;
 585                         $uncorrtext .= $taguncorr;
 586                 }
 587         }
 588         throw( ident => "text not found",
 589                tags => [ $node->nodeName ],
 590                message => "No text found in node " . $node->toString(0) )
 591             unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
 592         return( $text, $uncorrtext );
 593 }
 594
 595 sub _split_words {
 596         my( $self, $string, $c ) = @_;
 597         my @raw_words = split( /\s+/, $string );
 598         my @words;
 599         foreach my $w ( @raw_words ) {
 600                 my $id = $self->sigil . 'r'. $c++;
 601                 my %opts = ( 'text' => $w, 'id' => $id, 'language' => $self->language );
 602                 my $w_obj = $self->tradition->collation->add_reading( \%opts );
 603                 # Skip any words that have been canonized out of existence.
 604                 next if( length( $w_obj->text ) == 0 );
 605                 push( @words, $w_obj );
 606         }
 607         return @words;
 608 }
 609
 610 sub _init_from_json {
 611         my( $self ) = shift;
 612         my $wit;
 613         if( $self->has_object ) {
 614                 $wit = $self->object;
 615         } elsif( $self->has_string ) {
 616                 $wit = from_json( $self->string );
 617         } elsif( $self->has_file ) {
 618         my $ok = open( INPUT, $self->file );
 619         unless( $ok ) {
 620                         throw( ident => "bad source",
 621                                    message => 'Could not open ' . $self->file . ' for reading' );
 622         }
 623         binmode( INPUT, ':encoding(UTF-8)' );
 624         my @lines = <INPUT>;
 625         close INPUT;
 626         $wit = from_json( join( '', @lines ) );
 627         }
 628
 629         if( exists $wit->{'id'} ) {
 630                 $self->_set_sigil( $wit->{'id'} );
 631         } elsif( !$self->has_sigil ) {
 632                 throw( ident => 'missing sigil',
 633                            message => 'Could not find witness sigil (id) in JSON spec' );
 634         }
 635         $self->identifier( $wit->{'name'} );
 636         my @words;
 637         my @layerwords;
 638         my( @text, @layertext );
 639         if( exists $wit->{'content'} ) {
 640                 # We need to tokenize the text ourselves.
 641                 @words = _split_words( $self, $wit->{'content'} );
 642         } elsif( exists $wit->{'tokens'} ) {
 643                 # We have a bunch of pretokenized words.
 644                 my $ctr = 0;
 645                 foreach my $token ( @{$wit->{'tokens'}} ) {
 646                         my $w_obj = $self->tradition->collation->add_reading({
 647                                 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 648                         push( @words, $w_obj );
 649                         push( @text, $token->{'t'} ); # TODO unless...?
 650                 }
 651                 ## TODO rethink this JSOn mechanism
 652                 if( exists $wit->{'layertokens'} ) {
 653                         foreach my $token ( @{$wit->{'layertokens'}} ) {
 654                                 my $w_obj = $self->tradition->collation->add_reading({
 655                                         'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 656                                 push( @layerwords, $w_obj );
 657                                 push( @layertext, $token->{'t'} );
 658                         }
 659                 }
 660         }
 661         $self->text( \@text );
 662         $self->layertext( \@layertext ) if @layertext;
 663         $self->path( \@words );
 664         $self->uncorrected_path( \@layerwords ) if @layerwords;
 665 }
 666
 667 sub _init_from_plaintext {
 668     my( $self ) = @_;
 669     my $str;
 670     if( $self->has_file ) {
 671         my $ok = open( INPUT, $self->file );
 672         unless( $ok ) {
 673                         throw( ident => "bad source",
 674                                    message => 'Could not open ' . $self->file . ' for reading' );
 675         }
 676         binmode( INPUT, ':encoding(UTF-8)' );
 677         my @lines = <INPUT>;
 678         close INPUT;
 679         $str = join( '', @lines );
 680     } elsif( $self->has_object ) { # ...seriously?
 681         $str = ${$self->object};
 682     } else {
 683         $str = $self->string;
 684     }
 685
 686     # TODO allow a different word separation expression
 687     my @text = split( /\s+/, $str );
 688     $self->text( \@text );
 689     my @words = _split_words( $self, $str );
 690         $self->path( \@words );
 691 }
 692
 693 sub throw {
 694         Text::Tradition::Error->throw(
 695                 'ident' => 'Witness parsing error',
 696                 'message' => $_[0],
 697                 );
 698 }
 699
 700 sub _xpc_for_el {
 701         my $el = shift;
 702         my $xpc = XML::LibXML::XPathContext->new( $el );
 703                 if( $el->namespaceURI ) {
 704                         $xpc->registerNs( 'tei', $el->namespaceURI );
 705                 }
 706         return $xpc;
 707 }
 708
 709 =head2 export_as_json
 710
 711 Exports the witness as a JSON structure, with the following keys:
 712
 713 =over 4
 714
 715 =item * id - The witness sigil
 716
 717 =item * name - The witness identifier
 718
 719 =item * tokens - An array of hashes of the form { "t":"WORD" }
 720
 721 =back
 722
 723 =begin testing
 724
 725 use Text::Tradition;
 726 my $trad = Text::Tradition->new();
 727
 728 my @text = qw/ Thhis is a line of text /;
 729 my $wit = $trad->add_witness(
 730     'sigil' => 'A',
 731     'string' => join( ' ', @text ),
 732     'sourcetype' => 'plaintext',
 733     'identifier' => 'test witness',
 734      );
 735 my $jsonstruct = $wit->export_as_json;
 736 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 737 is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
 738 is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
 739 foreach my $idx ( 0 .. $#text ) {
 740         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
 741 }
 742
 743 my @ctext = qw( when april with his showers sweet with fruit the drought of march
 744                                 has pierced unto the root );
 745 $trad = Text::Tradition->new(
 746         'input' => 'CollateX',
 747         'file' => 't/data/Collatex-16.xml' );
 748
 749 $jsonstruct = $trad->witness('A')->export_as_json;
 750 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 751 is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
 752 is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
 753 foreach my $idx ( 0 .. $#ctext ) {
 754         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
 755 }
 756
 757 ## TODO test layertext export
 758
 759 =end testing
 760
 761 =cut
 762
 763 sub export_as_json {
 764         my $self = shift;
 765         my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
 766         my $obj =  {
 767                 'id' => $self->sigil,
 768                 'tokens' => \@wordlist,
 769                 'name' => $self->identifier,
 770         };
 771         if( $self->is_layered ) {
 772                 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
 773                 $obj->{'layertokens'} = \@lwlist;
 774         }
 775         return $obj;
 776 }
 777
 778 no Moose;
 779 __PACKAGE__->meta->make_immutable;
 780
 781 =head1 BUGS / TODO
 782
 783 =over
 784
 785 =item * Support encodings other than UTF-8
 786
 787 =back
 788
 789 =head1 LICENSE
 790
 791 This package is free software and is provided "as is" without express
 792 or implied warranty.  You can redistribute it and/or modify it under
 793 the same terms as Perl itself.
 794
 795 =head1 AUTHOR
 796
 797 Tara L Andrews E<lt>aurum@cpan.orgE<gt>