base/lib/Text/Tradition/Witness.pm

   1 package Text::Tradition::Witness;
   2
   3 use vars qw( %tags );
   4 use JSON;
   5 use Moose;
   6 use Moose::Util::TypeConstraints;
   7 use Text::TEI::Markup qw( word_tag_wrap );
   8 use TryCatch;
   9 use XML::Easy::Syntax qw( $xml10_name_rx );
  10
  11 =head1 NAME
  12
  13 Text::Tradition::Witness - a manuscript witness to a text tradition
  14
  15 =head1 SYNOPSIS
  16
  17   use Text::Tradition::Witness;
  18   my $w = Text::Tradition::Witness->new(
  19     'sigil' => 'A',
  20     'identifier' => 'Oxford MS Ex.1932',
  21     );
  22
  23 =head1 DESCRIPTION
  24
  25 Text::Tradition::Witness is an object representation of a manuscript
  26 witness to a text tradition.  A manuscript has a sigil (a short code that
  27 represents it in the wider tradition), an identifier (e.g. the library ID),
  28 and probably a text.
  29
  30 =head1 METHODS
  31
  32 =head2 new
  33
  34 Create a new witness.  Options include:
  35
  36 =over
  37
  38 =item * sigil - A short code to represent the manuscript.  Required.
  39
  40 =item * sourcetype - What sort of witness data this is. Options are
  41 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
  42 used by Collation parsers.)
  43
  44 =item * file
  45 =item * string
  46 =item * object
  47
  48 The data source for the witness.  Use the appropriate option.
  49
  50 =item * use_text - An initialization option.  If the witness is read from a
  51 TEI document and more than one <text/> tag exists therein, the default
  52 behavior is to use the first defined text.  If this is not desired,
  53 use_text should be set to an XPath expression that will select the correct
  54 text.
  55
  56 =item * identifier - The recognized name of the manuscript, e.g. a library
  57 identifier. Taken from the msDesc element for a TEI file.
  58
  59 =item * other_info - A freeform string for any other description of the
  60 manuscript.
  61
  62 =back
  63
  64 =head2 sigil
  65
  66 The sigil by which to identify this manuscript, which must conform to the
  67 specification for XML attribute strings (broadly speaking, it must begin
  68 with a letter and can have only a few sorts of punctuation characters in
  69 it.)
  70
  71 =head2 identifier
  72
  73 A freeform name by which to identify the manuscript, which may be longer
  74 than the sigil.  Defaults to 'Unidentified ms', but will be taken from the
  75 TEI msName attribute, or constructed from the settlement and idno if
  76 supplied.
  77
  78 =head2 settlement
  79
  80 The city, town, etc. where the manuscript is held. Will be read from the
  81 TEI msDesc element if supplied.
  82
  83 =head2 repository
  84
  85 The institution that holds the manuscript. Will be read from the TEI msDesc
  86 element if supplied.
  87
  88 =head2 idno
  89
  90 The identification or call number of the manuscript.  Will be read from the
  91 TEI msDesc element if supplied.
  92
  93 =head2 text
  94
  95 An array of strings (words) that contains the text of the
  96 manuscript.  This should not change after the witness has been
  97 instantiated, and the path through the collation should always match it.
  98
  99 =head2 layertext
 100
 101 An array of strings (words) that contains the layered
 102 text, if any, of the manuscript.  This should not change after the witness
 103 has been instantiated, and the path through the collation should always
 104 match it.
 105
 106 =head2 identifier
 107
 108 Accessor method for the witness identifier.
 109
 110 =head2 other_info
 111
 112 Accessor method for the general witness description.
 113
 114 =head2 has_source
 115
 116 Boolean method that returns a true value if the witness was created with a
 117 data source (that is, a file, string, or object to be parsed).
 118
 119 =head2 is_layered
 120
 121 Boolean method to note whether the witness has layers (e.g. pre-correction
 122 readings) in the collation.
 123
 124 =begin testing
 125
 126 use Text::Tradition;
 127 my $trad = Text::Tradition->new( 'name' => 'test tradition' );
 128 my $c = $trad->collation;
 129
 130 # Test a plaintext witness via string
 131 my $str = 'This is a line of text';
 132 my $ptwit = $trad->add_witness(
 133     'sigil' => 'A',
 134     'sourcetype' => 'plaintext',
 135     'string' => $str
 136      );
 137 is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
 138 if( $ptwit ) {
 139     is( $ptwit->sigil, 'A', "Witness has correct sigil" );
 140     $c->make_witness_path( $ptwit );
 141     is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
 142 }
 143
 144 # Test some JSON witnesses via object
 145 open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
 146 binmode( JSIN, ':encoding(UTF-8)' );
 147 my @lines = <JSIN>;
 148 close JSIN;
 149 $trad->add_json_witnesses( join( '', @lines ) );
 150 is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
 151         "Found first JSON witness" );
 152 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
 153         "Found second JSON witness" );
 154
 155 # Test an XML witness via file
 156 my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
 157         'file' => 't/data/witnesses/teiwit.xml' );
 158 is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
 159 if( $xmlwit ) {
 160         is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
 161         ok( $xmlwit->is_layered, "Picked up correction layer" );
 162         is( @{$xmlwit->text}, 182, "Got correct text length" );
 163         is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
 164 }
 165 my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
 166 is( @allwitwords, 184, "Reused appropriate readings" );
 167
 168 ## Test use_text
 169 my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
 170         'file' => 't/data/witnesses/group.xml',
 171         'use_text' => '//tei:group/tei:text[2]' );
 172 is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
 173 if( $xpwit ) {
 174         is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
 175         ok( !$xpwit->is_layered, "Picked up no correction layer" );
 176         is( @{$xpwit->text}, 157, "Got correct text length" );
 177 }
 178
 179
 180 =end testing
 181
 182 =cut
 183
 184 # Enable plugin(s) if available
 185 eval { with 'Text::Tradition::WitLanguage'; };
 186
 187 subtype 'SourceType',
 188         as 'Str',
 189         where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
 190         message { 'Source type must be one of xmldesc, plaintext, json, collation' };
 191
 192 subtype 'Sigil',
 193         as 'Str',
 194         where { $_ =~ /\A$xml10_name_rx\z/ },
 195         message { 'Sigil must be a valid XML attribute string' };
 196
 197 no Moose::Util::TypeConstraints;
 198
 199 has 'tradition' => (
 200         is => 'ro',
 201         isa => 'Text::Tradition',
 202         required => 1,
 203         weak_ref => 1
 204         );
 205
 206 # Sigil. Required identifier for a witness, but may be found inside
 207 # the XML file.
 208 has 'sigil' => (
 209         is => 'ro',
 210         isa => 'Sigil',
 211         predicate => 'has_sigil',
 212         writer => '_set_sigil',
 213         );
 214
 215 # Other identifying information
 216 has 'identifier' => (
 217         is => 'rw',
 218         isa => 'Str',
 219         );
 220
 221 has 'settlement' => (
 222         is => 'rw',
 223         isa => 'Str',
 224         );
 225
 226 has 'repository' => (
 227         is => 'rw',
 228         isa => 'Str',
 229         );
 230
 231 has 'idno' => (
 232         is => 'rw',
 233         isa => 'Str',
 234         );
 235
 236 # Source. Can be XML obj, JSON data struct, or string.
 237 # Not used if the witness is created by parsing a collation.
 238 has 'sourcetype' => (
 239         is => 'ro',
 240         isa => 'SourceType',
 241         required => 1,
 242 );
 243
 244 has 'file' => (
 245         is => 'ro',
 246         isa => 'Str',
 247         predicate => 'has_file',
 248 );
 249
 250 has 'string' => (
 251         is => 'ro',
 252         isa => 'Str',
 253         predicate => 'has_string',
 254 );
 255
 256 has 'object' => ( # could be anything.
 257         is => 'ro',
 258         predicate => 'has_object',
 259         clearer => 'clear_object',
 260 );
 261
 262 # In the case of a TEI document with multiple texts, specify
 263 # which text is the root. Should be an XPath expression.
 264 has 'use_text' => (
 265         is => 'ro',
 266         isa => 'Str',
 267         );
 268
 269 # Text.  This is an array of strings (i.e. word tokens).
 270 # TODO Think about how to handle this for the case of pre-prepared
 271 # collations, where the tokens are in the graph already.
 272 has 'text' => (
 273         is => 'rw',
 274         isa => 'ArrayRef[Str]',
 275         predicate => 'has_text',
 276         );
 277
 278 has 'layertext' => (
 279         is => 'rw',
 280         isa => 'ArrayRef[Str]',
 281         predicate => 'has_layertext',
 282         );
 283
 284 has 'is_collated' => (
 285         is => 'rw',
 286         isa => 'Bool'
 287         );
 288
 289 # Path.  This is an array of Reading nodes that can be saved during
 290 # initialization, but should be cleared before saving in a DB.
 291 has 'path' => (
 292         is => 'rw',
 293         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 294         predicate => 'has_path',
 295         clearer => 'clear_path',
 296         );
 297
 298 ## TODO change the name of this
 299 has 'uncorrected_path' => (
 300         is => 'rw',
 301         isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
 302         clearer => 'clear_uncorrected_path',
 303         );
 304
 305 has 'is_layered' => (
 306         is => 'rw',
 307         isa => 'Bool',
 308         );
 309
 310 # If we set an uncorrected path, ever, remember that we did so.
 311 around 'uncorrected_path' => sub {
 312         my $orig = shift;
 313         my $self = shift;
 314
 315         $self->is_layered( 1 );
 316         $self->$orig( @_ );
 317 };
 318
 319 sub BUILD {
 320         my $self = shift;
 321         if( $self->has_source ) {
 322                 my $init_sub = '_init_from_' . $self->sourcetype;
 323                 $self->$init_sub();
 324                 # Remove our XML / source objects; we no longer need them.
 325                 $self->clear_object if $self->has_object;
 326                 # $self->tradition->collation->make_witness_path( $self );
 327         }
 328         if( $self->sourcetype eq 'collation' ) {
 329                 $self->is_collated( 1 );
 330         }
 331         return $self;
 332 }
 333
 334 sub has_source {
 335         my $self = shift;
 336         return $self->has_file || $self->has_string || $self->has_object;
 337 }
 338
 339 sub _init_from_xmldesc {
 340         my $self = shift;
 341         my $xmlobj;
 342         if( $self->has_object ) {
 343                 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
 344                         throw( ident => "bad source",
 345                                    message => "Source object must be an XML::LibXML::Element (this is "
 346                                         . ref( $self->object ) . ");" );
 347                 }
 348                 $xmlobj = $self->object;
 349         } else {
 350                 require XML::LibXML;
 351                 my $parser = XML::LibXML->new();
 352                 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
 353                 try {
 354                         $xmlobj = $parser->$parsersub( $self->file )->documentElement;
 355                 } catch( XML::LibXML::Error $e ) {
 356                         throw( ident => "bad source",
 357                                    message => "XML parsing error: " . $e->as_string );
 358                 }
 359         }
 360
 361         unless( $xmlobj->nodeName eq 'TEI' ) {
 362                 throw( ident => "bad source",
 363                        message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
 364         }
 365
 366         # Set up the tags we need, with or without namespaces.
 367         map { $tags{$_} = "//$_" }
 368                 qw/ msDesc msName settlement repository idno p lg w seg add del /;
 369         # Set up our XPath object
 370         my $xpc = _xpc_for_el( $xmlobj );
 371         # Use namespace-aware tags if we have to
 372         if( $xmlobj->namespaceURI ) {
 373             map { $tags{$_} = "//tei:$_" } keys %tags;
 374         }
 375
 376         # Get the identifier
 377         if( my $desc = $xpc->find( $tags{msDesc} ) ) {
 378                 my $descnode = $desc->get_node(1);
 379                 # First try to use settlement/repository/idno.
 380                 my( $setNode, $reposNode, $idNode ) =
 381                         ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
 382                           $xpc->find( $tags{repository}, $descnode )->get_node(1),
 383                           $xpc->find( $tags{idno}, $descnode )->get_node(1) );
 384                 $self->settlement( $setNode ? $setNode->textContent : '' );
 385                 $self->repository( $reposNode ? $reposNode->textContent : '' );
 386                 $self->idno( $idNode ? $idNode->textContent : '' );
 387                 if( $self->settlement && $self->idno ) {
 388                 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
 389                 } else {
 390                     # Look for an msName.
 391                     my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
 392                     if( $msNameNode ) {
 393                 $self->identifier( $msNameNode->textContent );
 394             } else {
 395                 # We have an msDesc but who knows what is in it?
 396                 my $desc = $descnode->textContent;
 397                 $desc =~ s/\n/ /gs;
 398                 $desc =~ s/\s+/ /g;
 399                 $self->identifier( $desc );
 400             }
 401         }
 402         if( $descnode->hasAttribute('xml:id') ) {
 403                         $self->_set_sigil( $descnode->getAttribute('xml:id') );
 404                 } elsif( !$self->has_sigil ) {
 405                         throw( ident => 'missing sigil',
 406                                    message => 'Could not find xml:id witness sigil' );
 407                 }
 408         } else {
 409             throw( ident => "bad source",
 410                    message => "Could not find manuscript description element in TEI header" );
 411         }
 412
 413         # Now get the words out.
 414         my @words;
 415         my @layerwords;  # if the witness has layers
 416         # First, make sure all the words are wrapped in tags.
 417         # TODO Make this not necessarily dependent upon whitespace...
 418         word_tag_wrap( $xmlobj );
 419         # Now go text hunting.
 420         my @textnodes;
 421         if( $self->use_text ) {
 422                 @textnodes = $xpc->findnodes( $self->use_text );
 423         } else {
 424                 # Use the first 'text' node in the document.
 425                 @textnodes = $xmlobj->getElementsByTagName( 'text' );
 426         }
 427         my $teitext = $textnodes[0];
 428         if( $teitext ) {
 429                 _tokenize_text( $self, $teitext, \@words, \@layerwords );
 430         } else {
 431             throw( ident => "bad source",
 432                    message => "No text element in document '" . $self->{'identifier'} . "!" );
 433         }
 434
 435         my @text = map { $_->text } @words;
 436         my @layertext = map { $_->text } @layerwords;
 437         $self->path( \@words );
 438         $self->text( \@text );
 439         if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
 440                 $self->uncorrected_path( \@layerwords );
 441                 $self->layertext( \@layertext );
 442         }
 443 }
 444
 445 sub _tokenize_text {
 446         my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
 447         # Strip out the words.
 448         my $xpc = _xpc_for_el( $teitext );
 449         my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
 450         foreach( @divs ) {
 451                 my $place_str;
 452                 if( my $n = $_->getAttribute( 'n' ) ) {
 453                         $place_str = '#DIV_' . $n . '#';
 454                 } else {
 455                         $place_str = '#DIV#';
 456                 }
 457                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
 458         }  # foreach <div/>
 459
 460         # But maybe we don't have any divs.  Just paragraphs.
 461         unless( @divs ) {
 462                 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
 463         }
 464 }
 465
 466 sub _objectify_words {
 467         my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
 468
 469         my $xpc = _xpc_for_el( $element );
 470         my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
 471         my @pgraphs = $xpc->findnodes( $xpexpr );
 472     return () unless @pgraphs;
 473     # Set up an expression to look for words and segs
 474     $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
 475         foreach my $pg ( @pgraphs ) {
 476                 # If this paragraph is the descendant of a note element,
 477                 # skip it.
 478                 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
 479                 next if scalar @noop_container;
 480                 # Get the text of each node
 481                 my $first_word = 1;
 482                 # Hunt down each wrapped word/seg, and make an object (or two objects)
 483                 # of it, if necessary.
 484                 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
 485                         my( $text, $uncorr ) = _get_word_strings( $c );
 486 #                       try {
 487 #                               ( $text, $uncorr ) = _get_word_object( $c );
 488 #                       } catch( Text::Tradition::Error $e
 489 #                                               where { $_->has_tag( 'lb' ) } ) {
 490 #                               next;
 491 #                       }
 492                         unless( defined $text || defined $uncorr ) {
 493                                 print STDERR "WARNING: no text in node " . $c->nodeName
 494                                         . "\n" unless $c->nodeName eq 'lb';
 495                                 next;
 496                         }
 497                         print STDERR "DEBUG: space found in element node "
 498                                 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
 499
 500                         my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
 501                         while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
 502                                 $ctr++;
 503                         }
 504                         my $id = $self->sigil . 'r' . $ctr;
 505                         my( $word, $acword );
 506                         if( $text ) {
 507                                 $word = $self->tradition->collation->add_reading(
 508                                         { 'id' => $id, 'text' => $text });
 509                         }
 510                         if( $uncorr && $uncorr ne $text ) {
 511                                 $id .= '_ac';
 512                                 $acword = $self->tradition->collation->add_reading(
 513                                         { 'id' => $id, 'text' => $uncorr });
 514                         } elsif( $uncorr ) {
 515                                 $acword = $word;
 516                         }
 517
 518 #                       if( $first_word ) {
 519 #                               $first_word = 0;
 520 #                               # Set the relevant sectioning markers
 521 #                               if( $divmarker ) {
 522 #                                       $w->add_placeholder( $divmarker );
 523 #                                       $divmarker = undef;
 524 #                               }
 525 #                               $w->add_placeholder( '#PG#' );
 526 #                       }
 527                         push( @$wordlist, $word ) if $word;
 528                         push( @$uncorrlist, $acword ) if $acword;
 529                 }
 530     }
 531 }
 532
 533 # Given a word or segment node, make a Reading object for the word
 534 # therein. Make two Reading objects if there is an 'uncorrected' vs.
 535 # 'corrected' state.
 536
 537 sub _get_word_strings {
 538         my( $node ) = @_;
 539         my( $text, $uncorrtext );
 540         # We can have an lb or pb in the middle of a word; if we do, the
 541         # whitespace (including \n) after the break becomes insignificant
 542         # and we want to nuke it.
 543         my $strip_leading_space = 0;
 544         my $word_excluded = 0;
 545         my $xpc = _xpc_for_el( $node );
 546         # TODO This does not cope with nested add/dels.
 547         my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
 548         my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
 549         foreach my $c ($node->childNodes() ) {
 550                 if( $c->nodeName eq 'num'
 551                         && defined $c->getAttribute( 'value' ) ) {
 552                         # Push the number.
 553                         $text .= $c->getAttribute( 'value' ) unless @deletion;
 554                         $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
 555                         # If this is just after a line/page break, return to normal behavior.
 556                         $strip_leading_space = 0;
 557                 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
 558                         # Set a flag that strips leading whitespace until we
 559                         # get to the next bit of non-whitespace.
 560                         $strip_leading_space = 1;
 561                 } elsif ( $c->nodeName eq 'fw'   # for catchwords
 562                                   || $c->nodeName eq 'sic'
 563                                   || $c->nodeName eq 'note'      #TODO: decide how to deal with notes
 564                                   || $c->textContent eq ''
 565                                   || ref( $c ) eq 'XML::LibXML::Comment' ) {
 566                         $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
 567                         next;
 568                 } elsif( $c->nodeName eq 'add' ) {
 569                         my( $use, $discard ) = _get_word_strings( $c );
 570                         $text .= $use;
 571                 } elsif( $c->nodeName eq 'del' ) {
 572                         my( $discard, $use ) = _get_word_strings( $c );
 573                         $uncorrtext .= $use;
 574                 } else {
 575                         my ( $tagtxt, $taguncorr );
 576                         if( ref( $c ) eq 'XML::LibXML::Text' ) {
 577                                 # A text node.
 578                                 $tagtxt = $c->textContent;
 579                                 $taguncorr = $c->textContent;
 580                         } else {
 581                                 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
 582                         }
 583                         if( $strip_leading_space ) {
 584                                 $tagtxt =~ s/^[\s\n]+//s;
 585                                 $taguncorr =~ s/^[\s\n]+//s;
 586                                 # Unset the flag as soon as we see non-whitespace.
 587                                 $strip_leading_space = 0 if $tagtxt;
 588                         }
 589                         $text .= $tagtxt;
 590                         $uncorrtext .= $taguncorr;
 591                 }
 592         }
 593         throw( ident => "text not found",
 594                tags => [ $node->nodeName ],
 595                message => "No text found in node " . $node->toString(0) )
 596             unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
 597         return( $text, $uncorrtext );
 598 }
 599
 600 sub _split_words {
 601         my( $self, $string, $c ) = @_;
 602         my @raw_words = split( /\s+/, $string );
 603         my @words;
 604         foreach my $w ( @raw_words ) {
 605                 my $id = $self->sigil . 'r'. $c++;
 606                 my %opts = ( 'text' => $w, 'id' => $id );
 607                 my $w_obj = $self->tradition->collation->add_reading( \%opts );
 608                 # Skip any words that have been canonized out of existence.
 609                 next if( length( $w_obj->text ) == 0 );
 610                 push( @words, $w_obj );
 611         }
 612         return @words;
 613 }
 614
 615 sub _init_from_json {
 616         my( $self ) = shift;
 617         my $wit;
 618         if( $self->has_object ) {
 619                 $wit = $self->object;
 620         } elsif( $self->has_string ) {
 621                 $wit = from_json( $self->string );
 622         } elsif( $self->has_file ) {
 623         my $ok = open( INPUT, $self->file );
 624         unless( $ok ) {
 625                         throw( ident => "bad source",
 626                                    message => 'Could not open ' . $self->file . ' for reading' );
 627         }
 628         binmode( INPUT, ':encoding(UTF-8)' );
 629         my @lines = <INPUT>;
 630         close INPUT;
 631         $wit = from_json( join( '', @lines ) );
 632         }
 633
 634         if( exists $wit->{'id'} ) {
 635                 $self->_set_sigil( $wit->{'id'} );
 636         } elsif( !$self->has_sigil ) {
 637                 throw( ident => 'missing sigil',
 638                            message => 'Could not find witness sigil (id) in JSON spec' );
 639         }
 640         $self->identifier( $wit->{'name'} );
 641         my @words;
 642         my @layerwords;
 643         my( @text, @layertext );
 644         if( exists $wit->{'content'} ) {
 645                 # We need to tokenize the text ourselves.
 646                 @words = _split_words( $self, $wit->{'content'} );
 647         } elsif( exists $wit->{'tokens'} ) {
 648                 # We have a bunch of pretokenized words.
 649                 my $ctr = 0;
 650                 foreach my $token ( @{$wit->{'tokens'}} ) {
 651                         my $w_obj = $self->tradition->collation->add_reading({
 652                                 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 653                         push( @words, $w_obj );
 654                         push( @text, $token->{'t'} ); # TODO unless...?
 655                 }
 656                 ## TODO rethink this JSOn mechanism
 657                 if( exists $wit->{'layertokens'} ) {
 658                         foreach my $token ( @{$wit->{'layertokens'}} ) {
 659                                 my $w_obj = $self->tradition->collation->add_reading({
 660                                         'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 661                                 push( @layerwords, $w_obj );
 662                                 push( @layertext, $token->{'t'} );
 663                         }
 664                 }
 665         }
 666         $self->text( \@text );
 667         $self->layertext( \@layertext ) if @layertext;
 668         $self->path( \@words );
 669         $self->uncorrected_path( \@layerwords ) if @layerwords;
 670 }
 671
 672 sub _init_from_plaintext {
 673     my( $self ) = @_;
 674     unless( $self->has_sigil ) {
 675         throw( "No sigil defined for the plaintext witness" );
 676     }
 677     my $str;
 678     if( $self->has_file ) {
 679         my $ok = open( INPUT, $self->file );
 680         unless( $ok ) {
 681                         throw( ident => "bad source",
 682                                    message => 'Could not open ' . $self->file . ' for reading' );
 683         }
 684         binmode( INPUT, ':encoding(UTF-8)' );
 685         my @lines = <INPUT>;
 686         close INPUT;
 687         $str = join( '', @lines );
 688     } elsif( $self->has_object ) { # ...seriously?
 689         $str = ${$self->object};
 690     } else {
 691         $str = $self->string;
 692     }
 693
 694     # TODO allow a different word separation expression
 695     my @text = split( /\s+/, $str );
 696     $self->text( \@text );
 697     my @words = _split_words( $self, $str );
 698         $self->path( \@words );
 699 }
 700
 701 sub throw {
 702         Text::Tradition::Error->throw(
 703                 'ident' => 'Witness parsing error',
 704                 'message' => $_[0],
 705                 );
 706 }
 707
 708 sub _xpc_for_el {
 709         my $el = shift;
 710         my $xpc = XML::LibXML::XPathContext->new( $el );
 711                 if( $el->namespaceURI ) {
 712                         $xpc->registerNs( 'tei', $el->namespaceURI );
 713                 }
 714         return $xpc;
 715 }
 716
 717 =head2 export_as_json
 718
 719 Exports the witness as a JSON structure, with the following keys:
 720
 721 =over 4
 722
 723 =item * id - The witness sigil
 724
 725 =item * name - The witness identifier
 726
 727 =item * tokens - An array of hashes of the form { "t":"WORD" }
 728
 729 =back
 730
 731 =begin testing
 732
 733 use Text::Tradition;
 734 my $trad = Text::Tradition->new();
 735
 736 my @text = qw/ Thhis is a line of text /;
 737 my $wit = $trad->add_witness(
 738     'sigil' => 'A',
 739     'string' => join( ' ', @text ),
 740     'sourcetype' => 'plaintext',
 741     'identifier' => 'test witness',
 742      );
 743 my $jsonstruct = $wit->export_as_json;
 744 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 745 is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
 746 is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
 747 foreach my $idx ( 0 .. $#text ) {
 748         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
 749 }
 750
 751 my @ctext = qw( when april with his showers sweet with fruit the drought of march
 752                                 has pierced unto the root );
 753 $trad = Text::Tradition->new(
 754         'input' => 'CollateX',
 755         'file' => 't/data/Collatex-16.xml' );
 756
 757 $jsonstruct = $trad->witness('A')->export_as_json;
 758 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
 759 is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
 760 is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
 761 foreach my $idx ( 0 .. $#ctext ) {
 762         is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
 763 }
 764
 765 ## TODO test layertext export
 766
 767 =end testing
 768
 769 =cut
 770
 771 sub export_as_json {
 772         my $self = shift;
 773         my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
 774         my $obj =  {
 775                 'id' => $self->sigil,
 776                 'tokens' => \@wordlist,
 777                 'name' => $self->identifier,
 778         };
 779         if( $self->is_layered ) {
 780                 my @lwlist = map { { 't' => $_ || '' } } @{$self->layertext};
 781                 $obj->{'layertokens'} = \@lwlist;
 782         }
 783         return $obj;
 784 }
 785
 786 no Moose;
 787 __PACKAGE__->meta->make_immutable;
 788
 789 =head1 BUGS / TODO
 790
 791 =over
 792
 793 =item * Figure out how to serialize a witness
 794
 795 =item * Support encodings other than UTF-8
 796
 797 =back
 798
 799 =head1 LICENSE
 800
 801 This package is free software and is provided "as is" without express
 802 or implied warranty.  You can redistribute it and/or modify it under
 803 the same terms as Perl itself.
 804
 805 =head1 AUTHOR
 806
 807 Tara L Andrews E<lt>aurum@cpan.orgE<gt>