lib/Text/Tradition/Witness.pm

   1 package Text::Tradition::Witness;
   2
   3 use vars qw( %tags );
   4 use JSON;
   5 use Moose;
   6 use Moose::Util::TypeConstraints;
   7 use Text::TEI::Markup qw( word_tag_wrap );
   8 use TryCatch;
   9 use XML::Easy::Syntax qw( $xml10_name_rx );
  10
  11 =head1 NAME
  12
  13 Text::Tradition::Witness - a manuscript witness to a text tradition
  14
  15 =head1 SYNOPSIS
  16
  17   use Text::Tradition::Witness;
  18   my $w = Text::Tradition::Witness->new(
  19     'sigil' => 'A',
  20     'identifier' => 'Oxford MS Ex.1932',
  21     );
  22
  23 =head1 DESCRIPTION
  24
  25 Text::Tradition::Witness is an object representation of a manuscript
  26 witness to a text tradition.  A manuscript has a sigil (a short code that
  27 represents it in the wider tradition), an identifier (e.g. the library ID),
  28 and probably a text.
  29
  30 =head1 METHODS
  31
  32 =head2 new
  33
  34 Create a new witness.  Options include:
  35
  36 =over
  37
  38 =item * sigil - A short code to represent the manuscript.  Required.
  39
  40 =item * sourcetype - What sort of witness data this is. Options are
  41 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
  42 used by Collation parsers.)
  43
  44 =item * file
  45 =item * string
  46 =item * object
  47
  48 The data source for the witness.  Use the appropriate option.
  49
  50 =item * use_text - An initialization option.  If the witness is read from a
  51 TEI document and more than one <text/> tag exists therein, the default
  52 behavior is to use the first defined text.  If this is not desired,
  53 use_text should be set to an XPath expression that will select the correct
  54 text.
  55
  56 =item * language - The name of the applicable L<Text::Tradition::Lang>
  57 module for language handling. Usually inherited from the language set in
  58 the L<Text::Tradition> object, and defaults to Default.
  59
  60 =item * identifier - The recognized name of the manuscript, e.g. a library
  61 identifier. Taken from the msDesc element for a TEI file.
  62
  63 =item * other_info - A freeform string for any other description of the
  64 manuscript.
  65
  66 =back
  67
  68 =head2 sigil
  69
  70 The sigil by which to identify this manuscript, which must conform to the
  71 specification for XML attribute strings (broadly speaking, it must begin
  72 with a letter and can have only a few sorts of punctuation characters in
  73 it.)
  74
  75 =head2 identifier
  76
  77 A freeform name by which to identify the manuscript, which may be longer
  78 than the sigil.  Defaults to 'Unidentified ms', but will be taken from the
  79 TEI msName attribute, or constructed from the settlement and idno if
  80 supplied.
  81
  82 =head2 settlement
  83
  84 The city, town, etc. where the manuscript is held. Will be read from the
  85 TEI msDesc element if supplied.
  86
  87 =head2 repository
  88
  89 The institution that holds the manuscript. Will be read from the TEI msDesc
  90 element if supplied.
  91
  92 =head2 idno
  93
  94 The identification or call number of the manuscript.  Will be read from the
  95 TEI msDesc element if supplied.
  96
  97 =head2 text
  98
  99 An array of strings (words) that contains the text of the
 100 manuscript.  This should not change after the witness has been
 101 instantiated, and the path through the collation should always match it.
 102
 103 =head2 layertext
 104
 105 An array of strings (words) that contains the layered
 106 text, if any, of the manuscript.  This should not change after the witness
 107 has been instantiated, and the path through the collation should always
 108 match it.
 109
 110 =head2 language
 111
 112 Accessor method to get the witness language.
 113
 114 =head2 identifier
 115
 116 Accessor method for the witness identifier.
 117
 118 =head2 other_info
 119
 120 Accessor method for the general witness description.
 121
 122 =head2 has_source
 123
 124 Boolean method that returns a true value if the witness was created with a
 125 data source (that is, a file, string, or object to be parsed).
 126
 127 =head2 is_layered
 128
 129 Boolean method to note whether the witness has layers (e.g. pre-correction
 130 readings) in the collation.
 131
 132 =begin testing
 133
 134 use Text::Tradition;
 135 my $trad = Text::Tradition->new( 'name' => 'test tradition' );
 136 my $c = $trad->collation;
 137
 138 # Test a plaintext witness via string
 139 my $str = 'This is a line of text';
 140 my $ptwit = $trad->add_witness(
 141     'sigil' => 'A',
 142     'sourcetype' => 'plaintext',
 143     'string' => $str
 144      );
 145 is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
 146 if( $ptwit ) {
 147     is( $ptwit->sigil, 'A', "Witness has correct sigil" );
 148     is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
 149 }
 150
 151 # Test some JSON witnesses via object
 152 open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
 153 binmode( JSIN, ':encoding(UTF-8)' );
 154 my @lines = <JSIN>;
 155 close JSIN;
 156 $trad->add_json_witnesses( join( '', @lines ) );
 157 is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
 158         "Found first JSON witness" );
 159 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
 160         "Found second JSON witness" );
 161
 162 # Test an XML witness via file
 163 my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
 164         'file' => 't/data/witnesses/teiwit.xml' );
 165 is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
 166 if( $xmlwit ) {
 167         is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
 168         ok( $xmlwit->is_layered, "Picked up correction layer" );
 169         is( @{$xmlwit->text}, 182, "Got correct text length" );
 170         is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
 171 }
 172 my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
 173 is( @allwitwords, 184, "Reused appropriate readings" );
 174
 175 ## Test use_text
 176 my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
 177         'file' => 't/data/witnesses/group.xml',
 178         'use_text' => '//tei:group/tei:text[2]' );
 179 is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
 180 if( $xpwit ) {
 181         is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
 182         ok( !$xpwit->is_layered, "Picked up no correction layer" );
 183         is( @{$xpwit->text}, 157, "Got correct text length" );
 184 }
 185
 186
 187 =end testing
 188
 189 =cut
 190
 191 subtype 'SourceType',
 192         as 'Str',
 193         where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
 194         message { 'Source type must be one of xmldesc, plaintext, json, collation' };
 195
 196 subtype 'Sigil',
 197         as 'Str',
 198         where { $_ =~ /\A$xml10_name_rx\z/ },
 199         message { 'Sigil must be a valid XML attribute string' };
 200
 201 no Moose::Util::TypeConstraints;
 202
 203 has 'tradition' => (
 204         'is' => 'ro',
 205         'isa' => 'Text::Tradition',
 206         'required' => 1,
 207         weak_ref => 1
 208         );
 209
 210 # Sigil. Required identifier for a witness, but may be found inside
 211 # the XML file.
 212 has 'sigil' => (
 213         is => 'ro',
 214         isa => 'Sigil',
 215         predicate => 'has_sigil',
 216         writer => '_set_sigil',
 217         );
 218
 219 has 'language' => (
 220     is => 'ro',
 221     isa => 'Str',
 222     default => 'Default',
 223     );
 224
 225 # Other identifying information
 226 has 'identifier' => (
 227         is => 'rw',
 228         isa => 'Str',
 229         );
 230
 231 has 'settlement' => (
 232         is => 'rw',
 233         isa => 'Str',
 234         );
 235
 236 has 'repository' => (
 237         is => 'rw',
 238         isa => 'Str',
 239         );
 240
 241 has 'idno' => (
 242         is => 'rw',
 243         isa => 'Str',
 244         );
 245
 246 # Source. Can be XML obj, JSON data struct, or string.
 247 # Not used if the witness is created by parsing a collation.
 248 has 'sourcetype' => (
 249         is => 'ro',
 250         isa => 'SourceType',
 251         required => 1,
 252 );
 253
 254 has 'file' => (
 255         is => 'ro',
 256         isa => 'Str',
 257         predicate => 'has_file',
 258 );
 259
 260 has 'string' => (
 261         is => 'ro',
 262         isa => 'Str',
 263         predicate => 'has_string',
 264 );
 265
 266 has 'object' => ( # could be anything.
 267         is => 'ro',
 268         predicate => 'has_object',
 269         clearer => 'clear_object',
 270 );
 271
 272 # In the case of a TEI document with multiple texts, specify
 273 # which text is the root. Should be an XPath expression.
 274 has 'use_text' => (
 275         is => 'ro',
 276         isa => 'Str',
 277         );
 278
 279 # Text.  This is an array of strings (i.e. word tokens).
 280 # TODO Think about how to handle this for the case of pre-prepared
 281 # collations, where the tokens are in the graph already.
 282 has 'text' => (
 283         is => 'rw',
 284         isa => 'ArrayRef[Str]',
 285         predicate => 'has_text',
 286         );
 287
 288 has 'layertext' => (
 289         is => 'rw',
 290         isa => 'ArrayRef[Str]',
 291         predicate => 'has_layertext',
 292         );
 293
 294 # Path.  This is an array of Reading nodes that can be saved during
 295 # initialization, but should be cleared before saving in a DB.
 296 has 'path' => (
 297         is => 'rw',
 298         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 299         predicate => 'has_path',
 300         clearer => 'clear_path',
 301         );
 302
 303 ## TODO change the name of this
 304 has 'uncorrected_path' => (
 305         is => 'rw',
 306         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 307         clearer => 'clear_uncorrected_path',
 308         );
 309
 310 has 'is_layered' => (
 311         is => 'rw',
 312         isa => 'Bool',
 313         );
 314
 315 # If we set an uncorrected path, ever, remember that we did so.
 316 around 'uncorrected_path' => sub {
 317         my $orig = shift;
 318         my $self = shift;
 319
 320         $self->is_layered( 1 );
 321         $self->$orig( @_ );
 322 };
 323
 324 sub BUILD {
 325         my $self = shift;
 326         if( $self->has_source ) {
 327                 my $init_sub = '_init_from_' . $self->sourcetype;
 328                 $self->$init_sub();
 329                 # Remove our XML / source objects; we no longer need them.
 330                 $self->clear_object if $self->has_object;
 331                 $self->tradition->collation->make_witness_path( $self );
 332         }
 333         return $self;
 334 }
 335
 336 sub has_source {
 337         my $self = shift;
 338         return $self->has_file || $self->has_string || $self->has_object;
 339 }
 340
 341 sub _init_from_xmldesc {
 342         my $self = shift;
 343         my $xmlobj;
 344         if( $self->has_object ) {
 345                 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
 346                         throw( ident => "bad source",
 347                                    message => "Source object must be an XML::LibXML::Element (this is "
 348                                         . ref( $self->object ) . ");" );
 349                 }
 350                 $xmlobj = $self->object;
 351         } else {
 352                 require XML::LibXML;
 353                 my $parser = XML::LibXML->new();
 354                 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
 355                 try {
 356                         $xmlobj = $parser->$parsersub( $self->file )->documentElement;
 357                 } catch( XML::LibXML::Error $e ) {
 358                         throw( ident => "bad source",
 359                                    message => "XML parsing error: " . $e->as_string );
 360                 }
 361         }
 362
 363         unless( $xmlobj->nodeName eq 'TEI' ) {
 364                 throw( ident => "bad source",
 365                        message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
 366         }
 367
 368         # Set up the tags we need, with or without namespaces.
 369         map { $tags{$_} = "//$_" }
 370                 qw/ msDesc msName settlement repository idno p lg w seg add del /;
 371         # Set up our XPath object
 372         my $xpc = _xpc_for_el( $xmlobj );
 373         # Use namespace-aware tags if we have to
 374         if( $xmlobj->namespaceURI ) {
 375             map { $tags{$_} = "//tei:$_" } keys %tags;
 376         }
 377
 378         # Get the identifier
 379         if( my $desc = $xpc->find( $tags{msDesc} ) ) {
 380                 my $descnode = $desc->get_node(1);
 381                 # First try to use settlement/repository/idno.
 382                 my( $setNode, $reposNode, $idNode ) =
 383                         ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
 384                           $xpc->find( $tags{repository}, $descnode )->get_node(1),
 385                           $xpc->find( $tags{idno}, $descnode )->get_node(1) );
 386                 $self->settlement( $setNode ? $setNode->textContent : '' );
 387                 $self->repository( $reposNode ? $reposNode->textContent : '' );
 388                 $self->idno( $idNode ? $idNode->textContent : '' );
 389                 if( $self->settlement && $self->idno ) {
 390                 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
 391                 } else {
 392                     # Look for an msName.
 393                     my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
 394                     if( $msNameNode ) {
 395                 $self->identifier( $msNameNode->textContent );
 396             } else {
 397                 # We have an msDesc but who knows what is in it?
 398                 my $desc = $descnode->textContent;
 399                 $desc =~ s/\n/ /gs;
 400                 $desc =~ s/\s+/ /g;
 401                 $self->identifier( $desc );
 402             }
 403         }
 404         if( $descnode->hasAttribute('xml:id') ) {
 405                         $self->_set_sigil( $descnode->getAttribute('xml:id') );
 406                 } elsif( !$self->has_sigil ) {
 407                         throw( ident => 'missing sigil',
 408                                    message => 'Could not find xml:id witness sigil' );
 409                 }
 410         } else {
 411             throw( ident => "bad source",
 412                    message => "Could not find manuscript description element in TEI header" );
 413         }
 414
 415         # Now get the words out.
 416         my @words;
 417         my @layerwords;  # if the witness has layers
 418         # First, make sure all the words are wrapped in tags.
 419         # TODO Make this not necessarily dependent upon whitespace...
 420         word_tag_wrap( $xmlobj );
 421         # Now go text hunting.
 422         my @textnodes;
 423         if( $self->use_text ) {
 424                 @textnodes = $xpc->findnodes( $self->use_text );
 425         } else {
 426                 # Use the first 'text' node in the document.
 427                 @textnodes = $xmlobj->getElementsByTagName( 'text' );
 428         }
 429         my $teitext = $textnodes[0];
 430         if( $teitext ) {
 431                 _tokenize_text( $self, $teitext, \@words, \@layerwords );
 432         } else {
 433             throw( ident => "bad source",
 434                    message => "No text element in document '" . $self->{'identifier'} . "!" );
 435         }
 436
 437         my @text = map { $_->text } @words;
 438         my @layertext = map { $_->text } @layerwords;
 439         $self->path( \@words );
 440         $self->text( \@text );
 441         if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
 442                 $self->uncorrected_path( \@layerwords );
 443                 $self->layertext( \@layertext );
 444         }
 445 }
 446
 447 sub _tokenize_text {
 448         my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
 449         # Strip out the words.
 450         my $xpc = _xpc_for_el( $teitext );
 451         my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
 452         foreach( @divs ) {
 453                 my $place_str;
 454                 if( my $n = $_->getAttribute( 'n' ) ) {
 455                         $place_str = '#DIV_' . $n . '#';
 456                 } else {
 457                         $place_str = '#DIV#';
 458                 }
 459                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
 460         }  # foreach <div/>
 461
 462         # But maybe we don't have any divs.  Just paragraphs.
 463         unless( @divs ) {
 464                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
 465         }
 466 }
 467
 468 sub _objectify_words {
 469         my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
 470
 471         my $xpc = _xpc_for_el( $element );
 472         my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
 473         my @pgraphs = $xpc->findnodes( $xpexpr );
 474     return () unless @pgraphs;
 475     # Set up an expression to look for words and segs
 476     $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
 477         foreach my $pg ( @pgraphs ) {
 478                 # If this paragraph is the descendant of a note element,
 479                 # skip it.
 480                 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
 481                 next if scalar @noop_container;
 482                 # Get the text of each node
 483                 my $first_word = 1;
 484                 # Hunt down each wrapped word/seg, and make an object (or two objects)
 485                 # of it, if necessary.
 486                 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
 487                         my( $text, $uncorr ) = _get_word_strings( $c );
 488 #                       try {
 489 #                               ( $text, $uncorr ) = _get_word_object( $c );
 490 #                       } catch( Text::Tradition::Error $e
 491 #                                               where { $_->has_tag( 'lb' ) } ) {
 492 #                               next;
 493 #                       }
 494                         unless( defined $text || defined $uncorr ) {
 495                                 print STDERR "WARNING: no text in node " . $c->nodeName
 496                                         . "\n" unless $c->nodeName eq 'lb';
 497                                 next;
 498                         }
 499                         print STDERR "DEBUG: space found in element node "
 500                                 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
 501
 502                         my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
 503                         while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
 504                                 $ctr++;
 505                         }
 506                         my $id = $self->sigil . 'r' . $ctr;
 507                         my( $word, $acword );
 508                         if( $text ) {
 509                                 $word = $self->tradition->collation->add_reading(
 510                                         { 'id' => $id, 'text' => $text });
 511                         }
 512                         if( $uncorr && $uncorr ne $text ) {
 513                                 $id .= '_ac';
 514                                 $acword = $self->tradition->collation->add_reading(
 515                                         { 'id' => $id, 'text' => $uncorr });
 516                         } elsif( $uncorr ) {
 517                                 $acword = $word;
 518                         }
 519
 520 #                       if( $first_word ) {
 521 #                               $first_word = 0;
 522 #                               # Set the relevant sectioning markers
 523 #                               if( $divmarker ) {
 524 #                                       $w->add_placeholder( $divmarker );
 525 #                                       $divmarker = undef;
 526 #                               }
 527 #                               $w->add_placeholder( '#PG#' );
 528 #                       }
 529                         push( @$wordlist, $word ) if $word;
 530                         push( @$uncorrlist, $acword ) if $acword;
 531                 }
 532     }
 533 }
 534
 535 # Given a word or segment node, make a Reading object for the word
 536 # therein. Make two Reading objects if there is an 'uncorrected' vs.
 537 # 'corrected' state.
 538
 539 sub _get_word_strings {
 540         my( $node ) = @_;
 541         my( $text, $uncorrtext );
 542         # We can have an lb or pb in the middle of a word; if we do, the
 543         # whitespace (including \n) after the break becomes insignificant
 544         # and we want to nuke it.
 545         my $strip_leading_space = 0;
 546         my $word_excluded = 0;
 547         my $xpc = _xpc_for_el( $node );
 548         # TODO This does not cope with nested add/dels.
 549         my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
 550         my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
 551         foreach my $c ($node->childNodes() ) {
 552                 if( $c->nodeName eq 'num'
 553                         && defined $c->getAttribute( 'value' ) ) {
 554                         # Push the number.
 555                         $text .= $c->getAttribute( 'value' ) unless @deletion;
 556                         $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
 557                         # If this is just after a line/page break, return to normal behavior.
 558                         $strip_leading_space = 0;
 559                 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
 560                         # Set a flag that strips leading whitespace until we
 561                         # get to the next bit of non-whitespace.
 562                         $strip_leading_space = 1;
 563                 } elsif ( $c->nodeName eq 'fw'   # for catchwords
 564                                   || $c->nodeName eq 'sic'
 565                                   || $c->nodeName eq 'note'      #TODO: decide how to deal with notes
 566                                   || $c->textContent eq ''
 567                                   || ref( $c ) eq 'XML::LibXML::Comment' ) {
 568                         $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
 569                         next;
 570                 } elsif( $c->nodeName eq 'add' ) {
 571                         my( $use, $discard ) = _get_word_strings( $c );
 572                         $text .= $use;
 573                 } elsif( $c->nodeName eq 'del' ) {
 574                         my( $discard, $use ) = _get_word_strings( $c );
 575                         $uncorrtext .= $use;
 576                 } else {
 577                         my ( $tagtxt, $taguncorr );
 578                         if( ref( $c ) eq 'XML::LibXML::Text' ) {
 579                                 # A text node.
 580                                 $tagtxt = $c->textContent;
 581                                 $taguncorr = $c->textContent;
 582                         } else {
 583                                 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
 584                         }
 585                         if( $strip_leading_space ) {
 586                                 $tagtxt =~ s/^[\s\n]+//s;
 587                                 $taguncorr =~ s/^[\s\n]+//s;
 588                                 # Unset the flag as soon as we see non-whitespace.
 589                                 $strip_leading_space = 0 if $tagtxt;
 590                         }
 591                         $text .= $tagtxt;
 592                         $uncorrtext .= $taguncorr;
 593                 }
 594         }
 595         throw( ident => "text not found",
 596                tags => [ $node->nodeName ],
 597                message => "No text found in node " . $node->toString(0) )
 598             unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
 599         return( $text, $uncorrtext );
 600 }
 601
 602 sub _split_words {
 603         my( $self, $string, $c ) = @_;
 604         my @raw_words = split( /\s+/, $string );
 605         my @words;
 606         foreach my $w ( @raw_words ) {
 607                 my $id = $self->sigil . 'r'. $c++;
 608                 my %opts = ( 'text' => $w, 'id' => $id, 'language' => $self->language );
 609                 my $w_obj = $self->tradition->collation->add_reading( \%opts );
 610                 # Skip any words that have been canonized out of existence.
 611                 next if( length( $w_obj->text ) == 0 );
 612                 push( @words, $w_obj );
 613         }
 614         return @words;
 615 }
 616
 617 sub _init_from_json {
 618         my( $self ) = shift;
 619         my $wit;
 620         if( $self->has_object ) {
 621                 $wit = $self->object;
 622         } elsif( $self->has_string ) {
 623                 $wit = from_json( $self->string );
 624         } elsif( $self->has_file ) {
 625         my $ok = open( INPUT, $self->file );
 626         unless( $ok ) {
 627                         throw( ident => "bad source",
 628                                    message => 'Could not open ' . $self->file . ' for reading' );
 629         }
 630         binmode( INPUT, ':encoding(UTF-8)' );
 631         my @lines = <INPUT>;
 632         close INPUT;
 633         $wit = from_json( join( '', @lines ) );
 634         }
 635
 636         if( exists $wit->{'id'} ) {
 637                 $self->_set_sigil( $wit->{'id'} );
 638         } elsif( !$self->has_sigil ) {
 639                 throw( ident => 'missing sigil',
 640                            message => 'Could not find witness sigil (id) in JSON spec' );
 641         }
 642         $self->identifier( $wit->{'name'} );
 643         my @words;
 644         my @layerwords;
 645         my( @text, @layertext );
 646         if( exists $wit->{'content'} ) {
 647                 # We need to tokenize the text ourselves.
 648                 @words = _split_words( $self, $wit->{'content'} );
 649         } elsif( exists $wit->{'tokens'} ) {
 650                 # We have a bunch of pretokenized words.
 651                 my $ctr = 0;
 652                 foreach my $token ( @{$wit->{'tokens'}} ) {
 653                         my $w_obj = $self->tradition->collation->add_reading({
 654                                 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 655                         push( @words, $w_obj );
 656                         push( @text, $token->{'t'} ); # TODO unless...?
 657                 }
 658                 ## TODO rethink this JSOn mechanism
 659                 if( exists $wit->{'layertokens'} ) {
 660                         foreach my $token ( @{$wit->{'layertokens'}} ) {
 661                                 my $w_obj = $self->tradition->collation->add_reading({
 662                                         'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 663                                 push( @layerwords, $w_obj );
 664                                 push( @layertext, $token->{'t'} );
 665                         }
 666                 }
 667         }
 668         $self->text( \@text );
 669         $self->layertext( \@layertext ) if @layertext;
 670         $self->path( \@words );
 671         $self->uncorrected_path( \@layerwords ) if @layerwords;
 672 }
 673
 674 sub _init_from_plaintext {
 675     my( $self ) = @_;
 676     my $str;
 677     if( $self->has_file ) {
 678         my $ok = open( INPUT, $self->file );
 679         unless( $ok ) {
 680                         throw( ident => "bad source",
 681                                    message => 'Could not open ' . $self->file . ' for reading' );
 682         }
 683         binmode( INPUT, ':encoding(UTF-8)' );
 684         my @lines = <INPUT>;
 685         close INPUT;
 686         $str = join( '', @lines );
 687     } elsif( $self->has_object ) { # ...seriously?
 688         $str = ${$self->object};
 689     } else {
 690         $str = $self->string;
 691     }
 692
 693     # TODO allow a different word separation expression
 694     my @text = split( /\s+/, $str );
 695     $self->text( \@text );
 696     my @words = _split_words( $self, $str );
 697         $self->path( \@words );
 698 }
 699
 700 sub throw {
 701         Text::Tradition::Error->throw(
 702                 'ident' => 'Witness parsing error',
 703                 'message' => $_[0],
 704                 );
 705 }
 706
 707 sub _xpc_for_el {
 708         my $el = shift;
 709         my $xpc = XML::LibXML::XPathContext->new( $el );
 710                 if( $el->namespaceURI ) {
 711                         $xpc->registerNs( 'tei', $el->namespaceURI );
 712                 }
 713         return $xpc;
 714 }
 715
 716 =head2 export_as_json
 717
 718 Exports the witness as a JSON structure, with the following keys:
 719
 720 =over 4
 721
 722 =item * id - The witness sigil
 723
 724 =item * name - The witness identifier
 725
 726 =item * tokens - An array of hashes of the form { "t":"WORD" }
 727
 728 =back
 729
 730 =begin testing
 731
 732 use Text::Tradition;
 733 my $trad = Text::Tradition->new();
 734
 735 my @text = qw/ Thhis is a line of text /;
 736 my $wit = $trad->add_witness(
 737     'sigil' => 'A',
 738     'string' => join( ' ', @text ),
 739     'sourcetype' => 'plaintext',
 740     'identifier' => 'test witness',
 741      );
 742 my $jsonstruct = $wit->export_as_json;
 743 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 744 is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
 745 is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
 746 foreach my $idx ( 0 .. $#text ) {
 747         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
 748 }
 749
 750 my @ctext = qw( when april with his showers sweet with fruit the drought of march
 751                                 has pierced unto the root );
 752 $trad = Text::Tradition->new(
 753         'input' => 'CollateX',
 754         'file' => 't/data/Collatex-16.xml' );
 755
 756 $jsonstruct = $trad->witness('A')->export_as_json;
 757 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 758 is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
 759 is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
 760 foreach my $idx ( 0 .. $#ctext ) {
 761         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
 762 }
 763
 764 ## TODO test layertext export
 765
 766 =end testing
 767
 768 =cut
 769
 770 sub export_as_json {
 771         my $self = shift;
 772         my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
 773         my $obj =  {
 774                 'id' => $self->sigil,
 775                 'tokens' => \@wordlist,
 776                 'name' => $self->identifier,
 777         };
 778         if( $self->is_layered ) {
 779                 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
 780                 $obj->{'layertokens'} = \@lwlist;
 781         }
 782         return $obj;
 783 }
 784
 785 no Moose;
 786 __PACKAGE__->meta->make_immutable;
 787
 788 =head1 BUGS / TODO
 789
 790 =over
 791
 792 =item * Figure out how to serialize a witness
 793
 794 =item * Support encodings other than UTF-8
 795
 796 =back
 797
 798 =head1 LICENSE
 799
 800 This package is free software and is provided "as is" without express
 801 or implied warranty.  You can redistribute it and/or modify it under
 802 the same terms as Perl itself.
 803
 804 =head1 AUTHOR
 805
 806 Tara L Andrews E<lt>aurum@cpan.orgE<gt>