base/lib/Text/Tradition/Witness.pm

   1 package Text::Tradition::Witness;
   2
   3 use vars qw( %tags );
   4 use JSON;
   5 use Moose;
   6 use Moose::Util::TypeConstraints;
   7 use Text::TEI::Markup qw( word_tag_wrap );
   8 use TryCatch;
   9 use XML::Easy::Syntax qw( $xml10_name_rx );
  10
  11 =head1 NAME
  12
  13 Text::Tradition::Witness - a manuscript witness to a text tradition
  14
  15 =head1 SYNOPSIS
  16
  17   use Text::Tradition::Witness;
  18   my $w = Text::Tradition::Witness->new(
  19     'sigil' => 'A',
  20     'identifier' => 'Oxford MS Ex.1932',
  21     );
  22
  23 =head1 DESCRIPTION
  24
  25 Text::Tradition::Witness is an object representation of a manuscript
  26 witness to a text tradition.  A manuscript has a sigil (a short code that
  27 represents it in the wider tradition), an identifier (e.g. the library ID),
  28 and probably a text.
  29
  30 =head1 METHODS
  31
  32 =head2 new
  33
  34 Create a new witness.  Options include:
  35
  36 =over
  37
  38 =item * sigil - A short code to represent the manuscript.  Required.
  39
  40 =item * sourcetype - What sort of witness data this is. Options are
  41 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
  42 used by Collation parsers.)
  43
  44 =item * file
  45 =item * string
  46 =item * object
  47
  48 The data source for the witness.  Use the appropriate option.
  49
  50 =item * use_text - An initialization option.  If the witness is read from a
  51 TEI document and more than one <text/> tag exists therein, the default
  52 behavior is to use the first defined text.  If this is not desired,
  53 use_text should be set to an XPath expression that will select the correct
  54 text.
  55
  56 =item * identifier - The recognized name of the manuscript, e.g. a library
  57 identifier. Taken from the msDesc element for a TEI file.
  58
  59 =item * other_info - A freeform string for any other description of the
  60 manuscript.
  61
  62 =back
  63
  64 =head2 sigil
  65
  66 The sigil by which to identify this manuscript, which must conform to the
  67 specification for XML attribute strings (broadly speaking, it must begin
  68 with a letter and can have only a few sorts of punctuation characters in
  69 it.)
  70
  71 =head2 identifier
  72
  73 A freeform name by which to identify the manuscript, which may be longer
  74 than the sigil.  Defaults to 'Unidentified ms', but will be taken from the
  75 TEI msName attribute, or constructed from the settlement and idno if
  76 supplied.
  77
  78 =head2 settlement
  79
  80 The city, town, etc. where the manuscript is held. Will be read from the
  81 TEI msDesc element if supplied.
  82
  83 =head2 repository
  84
  85 The institution that holds the manuscript. Will be read from the TEI msDesc
  86 element if supplied.
  87
  88 =head2 idno
  89
  90 The identification or call number of the manuscript.  Will be read from the
  91 TEI msDesc element if supplied.
  92
  93 =head2 text
  94
  95 An array of strings (words) that contains the text of the
  96 manuscript.  This should not change after the witness has been
  97 instantiated, and the path through the collation should always match it.
  98
  99 =head2 layertext
 100
 101 An array of strings (words) that contains the layered
 102 text, if any, of the manuscript.  This should not change after the witness
 103 has been instantiated, and the path through the collation should always
 104 match it.
 105
 106 =head2 identifier
 107
 108 Accessor method for the witness identifier.
 109
 110 =head2 other_info
 111
 112 Accessor method for the general witness description.
 113
 114 =head2 has_source
 115
 116 Boolean method that returns a true value if the witness was created with a
 117 data source (that is, a file, string, or object to be parsed).
 118
 119 =head2 is_layered
 120
 121 Boolean method to note whether the witness has layers (e.g. pre-correction
 122 readings) in the collation.
 123
 124 =begin testing
 125
 126 use Text::Tradition;
 127 my $trad = Text::Tradition->new( 'name' => 'test tradition' );
 128 my $c = $trad->collation;
 129
 130 # Test a plaintext witness via string
 131 my $str = 'This is a line of text';
 132 my $ptwit = $trad->add_witness(
 133     'sigil' => 'A',
 134     'sourcetype' => 'plaintext',
 135     'string' => $str
 136      );
 137 is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
 138 if( $ptwit ) {
 139     is( $ptwit->sigil, 'A', "Witness has correct sigil" );
 140     is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
 141 }
 142
 143 # Test some JSON witnesses via object
 144 open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
 145 binmode( JSIN, ':encoding(UTF-8)' );
 146 my @lines = <JSIN>;
 147 close JSIN;
 148 $trad->add_json_witnesses( join( '', @lines ) );
 149 is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
 150         "Found first JSON witness" );
 151 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
 152         "Found second JSON witness" );
 153
 154 # Test an XML witness via file
 155 my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
 156         'file' => 't/data/witnesses/teiwit.xml' );
 157 is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
 158 if( $xmlwit ) {
 159         is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
 160         ok( $xmlwit->is_layered, "Picked up correction layer" );
 161         is( @{$xmlwit->text}, 182, "Got correct text length" );
 162         is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
 163 }
 164 my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
 165 is( @allwitwords, 184, "Reused appropriate readings" );
 166
 167 ## Test use_text
 168 my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
 169         'file' => 't/data/witnesses/group.xml',
 170         'use_text' => '//tei:group/tei:text[2]' );
 171 is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
 172 if( $xpwit ) {
 173         is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
 174         ok( !$xpwit->is_layered, "Picked up no correction layer" );
 175         is( @{$xpwit->text}, 157, "Got correct text length" );
 176 }
 177
 178
 179 =end testing
 180
 181 =cut
 182
 183 # Enable plugin(s) if available
 184 eval { with 'Text::Tradition::WitLanguage'; };
 185
 186 subtype 'SourceType',
 187         as 'Str',
 188         where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
 189         message { 'Source type must be one of xmldesc, plaintext, json, collation' };
 190
 191 subtype 'Sigil',
 192         as 'Str',
 193         where { $_ =~ /\A$xml10_name_rx\z/ },
 194         message { 'Sigil must be a valid XML attribute string' };
 195
 196 no Moose::Util::TypeConstraints;
 197
 198 has 'tradition' => (
 199         'is' => 'ro',
 200         'isa' => 'Text::Tradition',
 201         'required' => 1,
 202         weak_ref => 1
 203         );
 204
 205 # Sigil. Required identifier for a witness, but may be found inside
 206 # the XML file.
 207 has 'sigil' => (
 208         is => 'ro',
 209         isa => 'Sigil',
 210         predicate => 'has_sigil',
 211         writer => '_set_sigil',
 212         );
 213
 214 # Other identifying information
 215 has 'identifier' => (
 216         is => 'rw',
 217         isa => 'Str',
 218         );
 219
 220 has 'settlement' => (
 221         is => 'rw',
 222         isa => 'Str',
 223         );
 224
 225 has 'repository' => (
 226         is => 'rw',
 227         isa => 'Str',
 228         );
 229
 230 has 'idno' => (
 231         is => 'rw',
 232         isa => 'Str',
 233         );
 234
 235 # Source. Can be XML obj, JSON data struct, or string.
 236 # Not used if the witness is created by parsing a collation.
 237 has 'sourcetype' => (
 238         is => 'ro',
 239         isa => 'SourceType',
 240         required => 1,
 241 );
 242
 243 has 'file' => (
 244         is => 'ro',
 245         isa => 'Str',
 246         predicate => 'has_file',
 247 );
 248
 249 has 'string' => (
 250         is => 'ro',
 251         isa => 'Str',
 252         predicate => 'has_string',
 253 );
 254
 255 has 'object' => ( # could be anything.
 256         is => 'ro',
 257         predicate => 'has_object',
 258         clearer => 'clear_object',
 259 );
 260
 261 # In the case of a TEI document with multiple texts, specify
 262 # which text is the root. Should be an XPath expression.
 263 has 'use_text' => (
 264         is => 'ro',
 265         isa => 'Str',
 266         );
 267
 268 # Text.  This is an array of strings (i.e. word tokens).
 269 # TODO Think about how to handle this for the case of pre-prepared
 270 # collations, where the tokens are in the graph already.
 271 has 'text' => (
 272         is => 'rw',
 273         isa => 'ArrayRef[Str]',
 274         predicate => 'has_text',
 275         );
 276
 277 has 'layertext' => (
 278         is => 'rw',
 279         isa => 'ArrayRef[Str]',
 280         predicate => 'has_layertext',
 281         );
 282
 283 has 'is_collated' => (
 284         is => 'rw',
 285         isa => 'Bool'
 286         );
 287
 288 # Path.  This is an array of Reading nodes that can be saved during
 289 # initialization, but should be cleared before saving in a DB.
 290 has 'path' => (
 291         is => 'rw',
 292         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 293         predicate => 'has_path',
 294         clearer => 'clear_path',
 295         );
 296
 297 ## TODO change the name of this
 298 has 'uncorrected_path' => (
 299         is => 'rw',
 300         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 301         clearer => 'clear_uncorrected_path',
 302         );
 303
 304 has 'is_layered' => (
 305         is => 'rw',
 306         isa => 'Bool',
 307         );
 308
 309 # If we set an uncorrected path, ever, remember that we did so.
 310 around 'uncorrected_path' => sub {
 311         my $orig = shift;
 312         my $self = shift;
 313
 314         $self->is_layered( 1 );
 315         $self->$orig( @_ );
 316 };
 317
 318 sub BUILD {
 319         my $self = shift;
 320         if( $self->has_source ) {
 321                 my $init_sub = '_init_from_' . $self->sourcetype;
 322                 $self->$init_sub();
 323                 # Remove our XML / source objects; we no longer need them.
 324                 $self->clear_object if $self->has_object;
 325                 # $self->tradition->collation->make_witness_path( $self );
 326         }
 327         if( $self->sourcetype eq 'collation' ) {
 328                 $self->is_collated( 1 );
 329         }
 330         return $self;
 331 }
 332
 333 sub has_source {
 334         my $self = shift;
 335         return $self->has_file || $self->has_string || $self->has_object;
 336 }
 337
 338 sub _init_from_xmldesc {
 339         my $self = shift;
 340         my $xmlobj;
 341         if( $self->has_object ) {
 342                 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
 343                         throw( ident => "bad source",
 344                                    message => "Source object must be an XML::LibXML::Element (this is "
 345                                         . ref( $self->object ) . ");" );
 346                 }
 347                 $xmlobj = $self->object;
 348         } else {
 349                 require XML::LibXML;
 350                 my $parser = XML::LibXML->new();
 351                 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
 352                 try {
 353                         $xmlobj = $parser->$parsersub( $self->file )->documentElement;
 354                 } catch( XML::LibXML::Error $e ) {
 355                         throw( ident => "bad source",
 356                                    message => "XML parsing error: " . $e->as_string );
 357                 }
 358         }
 359
 360         unless( $xmlobj->nodeName eq 'TEI' ) {
 361                 throw( ident => "bad source",
 362                        message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
 363         }
 364
 365         # Set up the tags we need, with or without namespaces.
 366         map { $tags{$_} = "//$_" }
 367                 qw/ msDesc msName settlement repository idno p lg w seg add del /;
 368         # Set up our XPath object
 369         my $xpc = _xpc_for_el( $xmlobj );
 370         # Use namespace-aware tags if we have to
 371         if( $xmlobj->namespaceURI ) {
 372             map { $tags{$_} = "//tei:$_" } keys %tags;
 373         }
 374
 375         # Get the identifier
 376         if( my $desc = $xpc->find( $tags{msDesc} ) ) {
 377                 my $descnode = $desc->get_node(1);
 378                 # First try to use settlement/repository/idno.
 379                 my( $setNode, $reposNode, $idNode ) =
 380                         ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
 381                           $xpc->find( $tags{repository}, $descnode )->get_node(1),
 382                           $xpc->find( $tags{idno}, $descnode )->get_node(1) );
 383                 $self->settlement( $setNode ? $setNode->textContent : '' );
 384                 $self->repository( $reposNode ? $reposNode->textContent : '' );
 385                 $self->idno( $idNode ? $idNode->textContent : '' );
 386                 if( $self->settlement && $self->idno ) {
 387                 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
 388                 } else {
 389                     # Look for an msName.
 390                     my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
 391                     if( $msNameNode ) {
 392                 $self->identifier( $msNameNode->textContent );
 393             } else {
 394                 # We have an msDesc but who knows what is in it?
 395                 my $desc = $descnode->textContent;
 396                 $desc =~ s/\n/ /gs;
 397                 $desc =~ s/\s+/ /g;
 398                 $self->identifier( $desc );
 399             }
 400         }
 401         if( $descnode->hasAttribute('xml:id') ) {
 402                         $self->_set_sigil( $descnode->getAttribute('xml:id') );
 403                 } elsif( !$self->has_sigil ) {
 404                         throw( ident => 'missing sigil',
 405                                    message => 'Could not find xml:id witness sigil' );
 406                 }
 407         } else {
 408             throw( ident => "bad source",
 409                    message => "Could not find manuscript description element in TEI header" );
 410         }
 411
 412         # Now get the words out.
 413         my @words;
 414         my @layerwords;  # if the witness has layers
 415         # First, make sure all the words are wrapped in tags.
 416         # TODO Make this not necessarily dependent upon whitespace...
 417         word_tag_wrap( $xmlobj );
 418         # Now go text hunting.
 419         my @textnodes;
 420         if( $self->use_text ) {
 421                 @textnodes = $xpc->findnodes( $self->use_text );
 422         } else {
 423                 # Use the first 'text' node in the document.
 424                 @textnodes = $xmlobj->getElementsByTagName( 'text' );
 425         }
 426         my $teitext = $textnodes[0];
 427         if( $teitext ) {
 428                 _tokenize_text( $self, $teitext, \@words, \@layerwords );
 429         } else {
 430             throw( ident => "bad source",
 431                    message => "No text element in document '" . $self->{'identifier'} . "!" );
 432         }
 433
 434         my @text = map { $_->text } @words;
 435         my @layertext = map { $_->text } @layerwords;
 436         $self->path( \@words );
 437         $self->text( \@text );
 438         if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
 439                 $self->uncorrected_path( \@layerwords );
 440                 $self->layertext( \@layertext );
 441         }
 442 }
 443
 444 sub _tokenize_text {
 445         my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
 446         # Strip out the words.
 447         my $xpc = _xpc_for_el( $teitext );
 448         my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
 449         foreach( @divs ) {
 450                 my $place_str;
 451                 if( my $n = $_->getAttribute( 'n' ) ) {
 452                         $place_str = '#DIV_' . $n . '#';
 453                 } else {
 454                         $place_str = '#DIV#';
 455                 }
 456                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
 457         }  # foreach <div/>
 458
 459         # But maybe we don't have any divs.  Just paragraphs.
 460         unless( @divs ) {
 461                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
 462         }
 463 }
 464
 465 sub _objectify_words {
 466         my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
 467
 468         my $xpc = _xpc_for_el( $element );
 469         my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
 470         my @pgraphs = $xpc->findnodes( $xpexpr );
 471     return () unless @pgraphs;
 472     # Set up an expression to look for words and segs
 473     $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
 474         foreach my $pg ( @pgraphs ) {
 475                 # If this paragraph is the descendant of a note element,
 476                 # skip it.
 477                 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
 478                 next if scalar @noop_container;
 479                 # Get the text of each node
 480                 my $first_word = 1;
 481                 # Hunt down each wrapped word/seg, and make an object (or two objects)
 482                 # of it, if necessary.
 483                 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
 484                         my( $text, $uncorr ) = _get_word_strings( $c );
 485 #                       try {
 486 #                               ( $text, $uncorr ) = _get_word_object( $c );
 487 #                       } catch( Text::Tradition::Error $e
 488 #                                               where { $_->has_tag( 'lb' ) } ) {
 489 #                               next;
 490 #                       }
 491                         unless( defined $text || defined $uncorr ) {
 492                                 print STDERR "WARNING: no text in node " . $c->nodeName
 493                                         . "\n" unless $c->nodeName eq 'lb';
 494                                 next;
 495                         }
 496                         print STDERR "DEBUG: space found in element node "
 497                                 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
 498
 499                         my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
 500                         while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
 501                                 $ctr++;
 502                         }
 503                         my $id = $self->sigil . 'r' . $ctr;
 504                         my( $word, $acword );
 505                         if( $text ) {
 506                                 $word = $self->tradition->collation->add_reading(
 507                                         { 'id' => $id, 'text' => $text });
 508                         }
 509                         if( $uncorr && $uncorr ne $text ) {
 510                                 $id .= '_ac';
 511                                 $acword = $self->tradition->collation->add_reading(
 512                                         { 'id' => $id, 'text' => $uncorr });
 513                         } elsif( $uncorr ) {
 514                                 $acword = $word;
 515                         }
 516
 517 #                       if( $first_word ) {
 518 #                               $first_word = 0;
 519 #                               # Set the relevant sectioning markers
 520 #                               if( $divmarker ) {
 521 #                                       $w->add_placeholder( $divmarker );
 522 #                                       $divmarker = undef;
 523 #                               }
 524 #                               $w->add_placeholder( '#PG#' );
 525 #                       }
 526                         push( @$wordlist, $word ) if $word;
 527                         push( @$uncorrlist, $acword ) if $acword;
 528                 }
 529     }
 530 }
 531
 532 # Given a word or segment node, make a Reading object for the word
 533 # therein. Make two Reading objects if there is an 'uncorrected' vs.
 534 # 'corrected' state.
 535
 536 sub _get_word_strings {
 537         my( $node ) = @_;
 538         my( $text, $uncorrtext );
 539         # We can have an lb or pb in the middle of a word; if we do, the
 540         # whitespace (including \n) after the break becomes insignificant
 541         # and we want to nuke it.
 542         my $strip_leading_space = 0;
 543         my $word_excluded = 0;
 544         my $xpc = _xpc_for_el( $node );
 545         # TODO This does not cope with nested add/dels.
 546         my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
 547         my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
 548         foreach my $c ($node->childNodes() ) {
 549                 if( $c->nodeName eq 'num'
 550                         && defined $c->getAttribute( 'value' ) ) {
 551                         # Push the number.
 552                         $text .= $c->getAttribute( 'value' ) unless @deletion;
 553                         $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
 554                         # If this is just after a line/page break, return to normal behavior.
 555                         $strip_leading_space = 0;
 556                 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
 557                         # Set a flag that strips leading whitespace until we
 558                         # get to the next bit of non-whitespace.
 559                         $strip_leading_space = 1;
 560                 } elsif ( $c->nodeName eq 'fw'   # for catchwords
 561                                   || $c->nodeName eq 'sic'
 562                                   || $c->nodeName eq 'note'      #TODO: decide how to deal with notes
 563                                   || $c->textContent eq ''
 564                                   || ref( $c ) eq 'XML::LibXML::Comment' ) {
 565                         $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
 566                         next;
 567                 } elsif( $c->nodeName eq 'add' ) {
 568                         my( $use, $discard ) = _get_word_strings( $c );
 569                         $text .= $use;
 570                 } elsif( $c->nodeName eq 'del' ) {
 571                         my( $discard, $use ) = _get_word_strings( $c );
 572                         $uncorrtext .= $use;
 573                 } else {
 574                         my ( $tagtxt, $taguncorr );
 575                         if( ref( $c ) eq 'XML::LibXML::Text' ) {
 576                                 # A text node.
 577                                 $tagtxt = $c->textContent;
 578                                 $taguncorr = $c->textContent;
 579                         } else {
 580                                 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
 581                         }
 582                         if( $strip_leading_space ) {
 583                                 $tagtxt =~ s/^[\s\n]+//s;
 584                                 $taguncorr =~ s/^[\s\n]+//s;
 585                                 # Unset the flag as soon as we see non-whitespace.
 586                                 $strip_leading_space = 0 if $tagtxt;
 587                         }
 588                         $text .= $tagtxt;
 589                         $uncorrtext .= $taguncorr;
 590                 }
 591         }
 592         throw( ident => "text not found",
 593                tags => [ $node->nodeName ],
 594                message => "No text found in node " . $node->toString(0) )
 595             unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
 596         return( $text, $uncorrtext );
 597 }
 598
 599 sub _split_words {
 600         my( $self, $string, $c ) = @_;
 601         my @raw_words = split( /\s+/, $string );
 602         my @words;
 603         foreach my $w ( @raw_words ) {
 604                 my $id = $self->sigil . 'r'. $c++;
 605                 my %opts = ( 'text' => $w, 'id' => $id );
 606                 my $w_obj = $self->tradition->collation->add_reading( \%opts );
 607                 # Skip any words that have been canonized out of existence.
 608                 next if( length( $w_obj->text ) == 0 );
 609                 push( @words, $w_obj );
 610         }
 611         return @words;
 612 }
 613
 614 sub _init_from_json {
 615         my( $self ) = shift;
 616         my $wit;
 617         if( $self->has_object ) {
 618                 $wit = $self->object;
 619         } elsif( $self->has_string ) {
 620                 $wit = from_json( $self->string );
 621         } elsif( $self->has_file ) {
 622         my $ok = open( INPUT, $self->file );
 623         unless( $ok ) {
 624                         throw( ident => "bad source",
 625                                    message => 'Could not open ' . $self->file . ' for reading' );
 626         }
 627         binmode( INPUT, ':encoding(UTF-8)' );
 628         my @lines = <INPUT>;
 629         close INPUT;
 630         $wit = from_json( join( '', @lines ) );
 631         }
 632
 633         if( exists $wit->{'id'} ) {
 634                 $self->_set_sigil( $wit->{'id'} );
 635         } elsif( !$self->has_sigil ) {
 636                 throw( ident => 'missing sigil',
 637                            message => 'Could not find witness sigil (id) in JSON spec' );
 638         }
 639         $self->identifier( $wit->{'name'} );
 640         my @words;
 641         my @layerwords;
 642         my( @text, @layertext );
 643         if( exists $wit->{'content'} ) {
 644                 # We need to tokenize the text ourselves.
 645                 @words = _split_words( $self, $wit->{'content'} );
 646         } elsif( exists $wit->{'tokens'} ) {
 647                 # We have a bunch of pretokenized words.
 648                 my $ctr = 0;
 649                 foreach my $token ( @{$wit->{'tokens'}} ) {
 650                         my $w_obj = $self->tradition->collation->add_reading({
 651                                 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 652                         push( @words, $w_obj );
 653                         push( @text, $token->{'t'} ); # TODO unless...?
 654                 }
 655                 ## TODO rethink this JSOn mechanism
 656                 if( exists $wit->{'layertokens'} ) {
 657                         foreach my $token ( @{$wit->{'layertokens'}} ) {
 658                                 my $w_obj = $self->tradition->collation->add_reading({
 659                                         'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 660                                 push( @layerwords, $w_obj );
 661                                 push( @layertext, $token->{'t'} );
 662                         }
 663                 }
 664         }
 665         $self->text( \@text );
 666         $self->layertext( \@layertext ) if @layertext;
 667         $self->path( \@words );
 668         $self->uncorrected_path( \@layerwords ) if @layerwords;
 669 }
 670
 671 sub _init_from_plaintext {
 672     my( $self ) = @_;
 673     unless( $self->has_sigil ) {
 674         throw( "No sigil defined for the plaintext witness" );
 675     }
 676     my $str;
 677     if( $self->has_file ) {
 678         my $ok = open( INPUT, $self->file );
 679         unless( $ok ) {
 680                         throw( ident => "bad source",
 681                                    message => 'Could not open ' . $self->file . ' for reading' );
 682         }
 683         binmode( INPUT, ':encoding(UTF-8)' );
 684         my @lines = <INPUT>;
 685         close INPUT;
 686         $str = join( '', @lines );
 687     } elsif( $self->has_object ) { # ...seriously?
 688         $str = ${$self->object};
 689     } else {
 690         $str = $self->string;
 691     }
 692
 693     # TODO allow a different word separation expression
 694     my @text = split( /\s+/, $str );
 695     $self->text( \@text );
 696     my @words = _split_words( $self, $str );
 697         $self->path( \@words );
 698 }
 699
 700 sub throw {
 701         Text::Tradition::Error->throw(
 702                 'ident' => 'Witness parsing error',
 703                 'message' => $_[0],
 704                 );
 705 }
 706
 707 sub _xpc_for_el {
 708         my $el = shift;
 709         my $xpc = XML::LibXML::XPathContext->new( $el );
 710                 if( $el->namespaceURI ) {
 711                         $xpc->registerNs( 'tei', $el->namespaceURI );
 712                 }
 713         return $xpc;
 714 }
 715
 716 =head2 export_as_json
 717
 718 Exports the witness as a JSON structure, with the following keys:
 719
 720 =over 4
 721
 722 =item * id - The witness sigil
 723
 724 =item * name - The witness identifier
 725
 726 =item * tokens - An array of hashes of the form { "t":"WORD" }
 727
 728 =back
 729
 730 =begin testing
 731
 732 use Text::Tradition;
 733 my $trad = Text::Tradition->new();
 734
 735 my @text = qw/ Thhis is a line of text /;
 736 my $wit = $trad->add_witness(
 737     'sigil' => 'A',
 738     'string' => join( ' ', @text ),
 739     'sourcetype' => 'plaintext',
 740     'identifier' => 'test witness',
 741      );
 742 my $jsonstruct = $wit->export_as_json;
 743 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 744 is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
 745 is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
 746 foreach my $idx ( 0 .. $#text ) {
 747         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
 748 }
 749
 750 my @ctext = qw( when april with his showers sweet with fruit the drought of march
 751                                 has pierced unto the root );
 752 $trad = Text::Tradition->new(
 753         'input' => 'CollateX',
 754         'file' => 't/data/Collatex-16.xml' );
 755
 756 $jsonstruct = $trad->witness('A')->export_as_json;
 757 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 758 is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
 759 is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
 760 foreach my $idx ( 0 .. $#ctext ) {
 761         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
 762 }
 763
 764 ## TODO test layertext export
 765
 766 =end testing
 767
 768 =cut
 769
 770 sub export_as_json {
 771         my $self = shift;
 772         my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
 773         my $obj =  {
 774                 'id' => $self->sigil,
 775                 'tokens' => \@wordlist,
 776                 'name' => $self->identifier,
 777         };
 778         if( $self->is_layered ) {
 779                 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
 780                 $obj->{'layertokens'} = \@lwlist;
 781         }
 782         return $obj;
 783 }
 784
 785 no Moose;
 786 __PACKAGE__->meta->make_immutable;
 787
 788 =head1 BUGS / TODO
 789
 790 =over
 791
 792 =item * Figure out how to serialize a witness
 793
 794 =item * Support encodings other than UTF-8
 795
 796 =back
 797
 798 =head1 LICENSE
 799
 800 This package is free software and is provided "as is" without express
 801 or implied warranty.  You can redistribute it and/or modify it under
 802 the same terms as Perl itself.
 803
 804 =head1 AUTHOR
 805
 806 Tara L Andrews E<lt>aurum@cpan.orgE<gt>