From: Tara L Andrews Date: Sun, 15 Apr 2012 21:15:18 +0000 (+0200) Subject: make witness plaintext parsing work X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=fae52efdaeb4b67f58e47ec8ebe110537f2535ea;p=scpubgit%2Fstemmatology.git make witness plaintext parsing work --- diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm index 0bbc76b..5f8ad83 100644 --- a/lib/Text/Tradition.pm +++ b/lib/Text/Tradition.pm @@ -1,5 +1,6 @@ package Text::Tradition; +use JSON qw / decode_json /; use Module::Load; use Moose; use Text::Tradition::Collation; @@ -37,6 +38,7 @@ has 'name' => ( has 'language' => ( is => 'rw', isa => 'Str', + predicate => 'has_language', ); has 'stemmata' => ( @@ -57,7 +59,11 @@ around 'add_witness' => sub { my $orig = shift; my $self = shift; # TODO allow add of a Witness object? - my $new_wit = Text::Tradition::Witness->new( @_ ); + my %args = @_ == 1 ? %{$_[0]} : @_; + $args{'tradition'} = $self; + $args{'language'} = $self->language + if( $self->language && !exists $args{'language'} ); + my $new_wit = Text::Tradition::Witness->new( %args ); $self->$orig( $new_wit->sigil => $new_wit ); return $new_wit; }; @@ -238,34 +244,14 @@ is( scalar $s->witnesses, 3, "object has three witnesses again" ); sub BUILD { my( $self, $init_args ) = @_; + + # First, make a collation object. This will use only those arguments in + # init_args that apply to the collation. + my $collation = Text::Tradition::Collation->new( %$init_args, + 'tradition' => $self ); + $self->_save_collation( $collation ); - if( exists $init_args->{'witnesses'} ) { - # We got passed an uncollated list of witnesses. Make a - # witness object for each witness, and then send them to the - # collator. - my $autosigil = 0; - foreach my $wit ( %{$init_args->{'witnesses'}} ) { - # Each item in the list is either a string or an arrayref. - # If it's a string, it is a filename; if it's an arrayref, - # it is a tuple of 'sigil, file'. Handle either case. - my $args; - if( ref( $wit ) eq 'ARRAY' ) { - $args = { 'sigil' => $wit->[0], - 'file' => $wit->[1] }; - } else { - $args = { 'sigil' => chr( $autosigil+65 ), - 'file' => $wit }; - $autosigil++; - } - $self->witnesses->add_witness( $args ); - # TODO Now how to collate these? - } - } else { - # Else we need to parse some collation data. Make a Collation object - my $collation = Text::Tradition::Collation->new( %$init_args, - 'tradition' => $self ); - $self->_save_collation( $collation ); - + if( exists $init_args->{'input'} ) { # Call the appropriate parser on the given data my @format_standalone = qw/ Self CollateText CollateX CTE JSON TEI Tabular /; my @format_basetext = qw/ KUL /; @@ -295,6 +281,26 @@ sub BUILD { $mod->can('parse')->( $self, $init_args ); } } + return $self; +} + +=head2 add_json_witnesses( $jsonstring, $options ) + +Adds a set of witnesses from a JSON array specification. This is a wrapper +to parse the JSON and call add_witness (with the specified $options) for +each element therein. + +=cut + +sub add_json_witnesses { + my( $self, $jsonstr, $extraopts ) = @_; + my $witarray = decode_json( $jsonstr ); + foreach my $witspec ( @$witarray ) { + my $opts = $extraopts || {}; + $opts->{'sourcetype'} = 'json'; + $opts->{'object'} = $witspec; + $self->add_witness( $opts ); + } } =head2 add_stemma( $dotfile ) diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index 6357a6f..4f7f571 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -1325,7 +1325,7 @@ sub common_readings { return @common; } -=head2 path_text( $sigil, $mainsigil [, $start, $end ] ) +=head2 path_text( $sigil, [, $start, $end ] ) Returns the text of a witness (plus its backup, if we are using a layer) as stored in the collation. The text is returned as a string, where the @@ -1387,11 +1387,17 @@ sub make_witness_path { my( $self, $wit ) = @_; my @chain = @{$wit->path}; my $sig = $wit->sigil; + # Add start and end if necessary + unshift( @chain, $self->start ) unless $chain[0] eq $self->start; + push( @chain, $self->end ) unless $chain[-1] eq $self->end; + $DB::single = 1; foreach my $idx ( 0 .. $#chain-1 ) { $self->add_path( $chain[$idx], $chain[$idx+1], $sig ); } if( $wit->is_layered ) { @chain = @{$wit->uncorrected_path}; + unshift( @chain, $self->start ) unless $chain[0] eq $self->start; + push( @chain, $self->end ) unless $chain[-1] eq $self->end; foreach my $idx( 0 .. $#chain-1 ) { my $source = $chain[$idx]; my $target = $chain[$idx+1]; diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm index 20b0990..d2d1fb7 100644 --- a/lib/Text/Tradition/Collation/Reading.pm +++ b/lib/Text/Tradition/Collation/Reading.pm @@ -82,6 +82,12 @@ has 'text' => ( writer => 'alter_text', ); +has 'language' => ( + is => 'ro', + isa => 'Str', + default => 'Default', + ); + has 'is_start' => ( is => 'ro', isa => 'Bool', diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm index 68948d5..3521d9f 100644 --- a/lib/Text/Tradition/Parser/CollateX.pm +++ b/lib/Text/Tradition/Parser/CollateX.pm @@ -126,7 +126,8 @@ sub parse { # Create the witness objects if they does not yet exist. foreach my $wit ( split( /, /, $e->{$WITKEY} ) ) { unless( $tradition->witness( $wit ) ) { - $tradition->add_witness( 'sigil' => $wit ); + $tradition->add_witness( + 'sigil' => $wit, 'sourcetype' => 'collation' ); } $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit ); } diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm index f1dd5be..4103651 100644 --- a/lib/Text/Tradition/Witness.pm +++ b/lib/Text/Tradition/Witness.pm @@ -1,6 +1,12 @@ package Text::Tradition::Witness; + +use vars qw( %tags ); +use JSON; use Moose; use Moose::Util::TypeConstraints; +use Text::TEI::Markup qw( word_tag_wrap ); +use TryCatch; +use XML::Easy::Syntax qw( $xml10_name_rx ); =head1 NAME @@ -31,36 +37,79 @@ Create a new witness. Options include: =item * sigil - A short code to represent the manuscript. Required. -=item * text - An array of strings (words) that contains the text of the -manuscript. This should not change after the witness has been instantiated, -and the path through the collation should always match it. +=item * sourcetype - What sort of witness data this is. Options are +'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be +used by Collation parsers.) -=item * layertext - An array of strings (words) that contains the layered text, -if any, of the manuscript. This should not change after the witness has been -instantiated, and the path through the collation should always match it. +=item * file +=item * string +=item * object + +The data source for the witness. Use the appropriate option. + +=item * use_text - An initialization option. If the witness is read from a +TEI document and more than one tag exists therein, the default +behavior is to use the first defined text. If this is not desired, +use_text should be set to an XPath expression that will select the correct +text. -=item * source - A reference to the text, such as a filename, if it is not -given in the 'text' option. +=item * language - The name of the applicable L +module for language handling. Usually inherited from the language set in +the L object, and defaults to Default. =item * identifier - The recognized name of the manuscript, e.g. a library -identifier. +identifier. Taken from the msDesc element for a TEI file. =item * other_info - A freeform string for any other description of the -manuscript. +manuscript. =back =head2 sigil -Accessor method for the witness sigil. +The sigil by which to identify this manuscript, which must conform to the +specification for XML attribute strings (broadly speaking, it must begin +with a letter and can have only a few sorts of punctuation characters in +it.) + +=head2 identifier + +A freeform name by which to identify the manuscript, which may be longer +than the sigil. Defaults to 'Unidentified ms', but will be taken from the +TEI msName attribute, or constructed from the settlement and idno if +supplied. + +=head2 settlement + +The city, town, etc. where the manuscript is held. Will be read from the +TEI msDesc element if supplied. + +=head2 repository + +The institution that holds the manuscript. Will be read from the TEI msDesc +element if supplied. + +=head2 idno + +The identification or call number of the manuscript. Will be read from the +TEI msDesc element if supplied. =head2 text -Accessor method to get and set the text array. +An array of strings (words) that contains the text of the +manuscript. This should not change after the witness has been +instantiated, and the path through the collation should always match it. + +=head2 layertext -=head2 source +An array of strings (words) that contains the layered +text, if any, of the manuscript. This should not change after the witness +has been instantiated, and the path through the collation should always +match it. -Accessor method to get and set the text source. +=head2 language + +Accessor method to get the witness language. =head2 identifier @@ -77,28 +126,143 @@ readings) in the collation. =begin testing -use_ok( 'Text::Tradition::Witness', "can use module" ); +use Text::Tradition; +my $trad = Text::Tradition->new( 'name' => 'test tradition' ); +my $c = $trad->collation; -my @text = qw( This is a line of text ); -my $wit = Text::Tradition::Witness->new( +# Test a plaintext witness via string +my $str = 'This is a line of text'; +my $ptwit = $trad->add_witness( 'sigil' => 'A', - 'text' => \@text, + 'sourcetype' => 'plaintext', + 'string' => $str ); -is( ref( $wit ), 'Text::Tradition::Witness', 'Created a witness' ); -if( $wit ) { - is( $wit->sigil, 'A', "Witness has correct sigil" ); - is( join( ' ', @{$wit->text} ), join( ' ', @text ), "Witness has correct text" ); +is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' ); +if( $ptwit ) { + is( $ptwit->sigil, 'A', "Witness has correct sigil" ); + is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" ); } +# # Test some JSON witnesses via object +# open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input"; +# binmode( JSIN, ':encoding(UTF-8)' ); +# my @lines = ; +# close JSIN; +# $trad->add_json_witnesses( join( '', @lines ) ); +# is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness', +# "Found first JSON witness" ); +# is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness', +# "Found second JSON witness" ); +# +# # Test an XML witness via file +# my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc', +# 'file' => 't/data/witnesses/teiwit.xml' ); +# is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" ); +# if( $xmlwit ) { +# is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" ); +# ok( $xmlwit->is_layered, "Picked up correction layer" ); +# is( @{$xmlwit->path}, 185, "Got correct text length" ); +# is( @{$xmlwit->uncorrected_path}, 185, "Got correct a.c. text length" ); +# } + +## Test use_text + =end testing =cut -# Sigil. Required identifier for a witness. +subtype 'SourceType', + as 'Str', + where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ }, + message { 'Source type must be one of xmldesc, plaintext, json, collation' }; + +subtype 'Sigil', + as 'Str', + where { $_ =~ /\A$xml10_name_rx\z/ }, + message { 'Sigil must be a valid XML attribute string' }; + +no Moose::Util::TypeConstraints; + +has 'tradition' => ( + 'is' => 'ro', + 'isa' => 'Text::Tradition', + 'required' => 1, + ); + +# Sigil. Required identifier for a witness, but may be found inside +# the XML file. has 'sigil' => ( is => 'ro', + isa => 'Sigil', + predicate => 'has_sigil', + writer => '_set_sigil', + ); + +# Other identifying information +has 'identifier' => ( + is => 'rw', + isa => 'Str', + ); + +has 'settlement' => ( + is => 'rw', + isa => 'Str', + ); + +has 'repository' => ( + is => 'rw', + isa => 'Str', + ); + +has 'idno' => ( + is => 'rw', + isa => 'Str', + ); + +has 'sourcetype' => ( + is => 'ro', + isa => 'SourceType', + required => 1, +); + +has 'language' => ( + is => 'ro', + isa => 'Str', + default => 'Default', + ); + +# Source. Can be XML obj, JSON data struct, or string. +# Not used if the witness is created by parsing a collation. +has 'file' => ( + is => 'ro', + isa => 'Str', + predicate => 'has_file', +); + +has 'string' => ( + is => 'ro', + isa => 'Str', + predicate => 'has_string', +); + +has 'object' => ( # could be anything. + is => 'ro', + predicate => 'has_object', + clearer => 'clear_object', +); + +# In the case of a TEI document with multiple texts, specify +# which text is the root. Should be an XPath expression. +has 'use_text' => ( + is => 'ro', isa => 'Str', - required => 1, + ); + +has 'msdesc' => ( # if we started with a TEI doc + is => 'ro', + isa => 'XML::LibXML::Element', + predicate => 'has_msdesc', + writer => '_save_msdesc', ); # Text. This is an array of strings (i.e. word tokens). @@ -115,15 +279,7 @@ has 'layertext' => ( isa => 'ArrayRef[Str]', predicate => 'has_layertext', ); - -# Source. This is where we read in the witness, if not from a -# pre-prepared collation. It is probably a filename. -has 'source' => ( - is => 'ro', - isa => 'Str', - predicate => 'has_source', - ); - + # Path. This is an array of Reading nodes that can be saved during # initialization, but should be cleared before saving in a DB. has 'path' => ( @@ -144,18 +300,6 @@ has 'is_layered' => ( isa => 'Bool', ); -# Manuscript name or similar -has 'identifier' => ( - is => 'ro', - isa => 'Str', - ); - -# Any other info we have -has 'other_info' => ( - is => 'ro', - isa => 'Str', - ); - # If we set an uncorrected path, ever, remember that we did so. around 'uncorrected_path' => sub { my $orig = shift; @@ -168,23 +312,372 @@ around 'uncorrected_path' => sub { sub BUILD { my $self = shift; if( $self->has_source ) { - # Read the file and initialize the text. - my $rc; - eval { no warnings; $rc = open( WITNESS, $self->source ); }; - # If we didn't open a file, assume it is a string. - if( $rc ) { - my @words; - while() { - chomp; - push( @words, split( /\s+/, $_ ) ); + my $init_sub = '_init_from_' . $self->sourcetype; + $self->$init_sub(); + # Remove our XML / source objects; we no longer need them. + $self->clear_object if $self->has_object; + $self->tradition->collation->make_witness_path( $self ); + } + return $self; +} + +sub has_source { + my $self = shift; + return $self->has_file || $self->has_string || $self->has_object; +} + +sub _init_from_xmldesc { + my $self = shift; + my $xmlobj; + if( $self->has_object ) { + unless( ref( $self->object ) eq 'XML::LibXML::Element' ) { + throw( ident => "bad source", + message => "Source object must be an XML::LibXML::Element (this is " + . ref( $self->object ) . ");" ); + } + $xmlobj = $self->object; + } else { + my $parser = XML::LibXML->new(); + my $parsersub = $self->has_file ? 'parse_file' : 'parse_string'; + try { + $xmlobj = $parser->$parsersub( $self->file )->documentElement; + } catch( XML::LibXML::Error $e ) { + throw( ident => "bad source", + message => "XML parsing error: " . $e->as_string ); + } + } + + unless( $xmlobj->nodeName eq 'TEI' ) { + throw( ident => "bad source", + message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" ); + } + + # Set up the tags we need, with or without namespaces. + map { $tags{$_} = "//$_" } + qw/ msDesc msName settlement repository idno p lg w seg add del /; + # Set up our XPath object + my $xpc = _xpc_for_el( $xmlobj ); + # Use namespace-aware tags if we have to + if( $xmlobj->namespaceURI ) { + map { $tags{$_} = "//tei:$_" } keys %tags; + } + + # Get the identifier + if( my $desc = $xpc->find( $tags{msDesc} ) ) { + my $descnode = $desc->get_node(1); + $self->_save_msdesc( $descnode ); + # First try to use settlement/repository/idno. + my( $setNode, $reposNode, $idNode ) = + ( $xpc->find( $tags{settlement}, $descnode )->get_node(1), + $xpc->find( $tags{repository}, $descnode )->get_node(1), + $xpc->find( $tags{idno}, $descnode )->get_node(1) ); + $self->settlement( $setNode ? $setNode->textContent : '' ); + $self->repository( $reposNode ? $reposNode->textContent : '' ); + $self->idno( $idNode ? $idNode->textContent : '' ); + if( $self->settlement && $self->idno ) { + $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) ); + } else { + # Look for an msName. + my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1); + if( $msNameNode ) { + $self->identifier( $msNameNode->textContent ); + } else { + # We have an msDesc but who knows what is in it? + my $desc = $descnode->textContent; + $desc =~ s/\n/ /gs; + $desc =~ s/\s+/ /g; + $self->identifier( $desc ); + } + } + if( $descnode->hasAttribute('xml:id') ) { + $self->_set_sigil( $descnode->getAttribute('xml:id') ); + } elsif( !$self->has_sigil ) { + throw( 'Could not find xml:id witness sigil' ); + } + } else { + throw( ident => "bad source", + message => "Could not find manuscript description element in TEI header" ); + } + + # Now get the words out. + my @words; + my @layerwords; # if the witness has layers + # First, make sure all the words are wrapped in tags. + # TODO Make this not necessarily dependent upon whitespace... + word_tag_wrap( $xmlobj ); + # Now go text hunting. + my @textnodes; + if( $self->use_text ) { + @textnodes = $xpc->findnodes( $self->use_text ); + } else { + # Use the first 'text' node in the document. + @textnodes = $xmlobj->getElementsByTagName( 'text' ); + } + my $teitext = $textnodes[0]; + if( $teitext ) { + _tokenize_text( $self, $teitext, \@words, \@layerwords ); + } else { + throw( ident => "bad source", + message => "No text element in document '" . $self->{'identifier'} . "!" ); + } + + $self->path( \@words ); + my $a = join( ' ', map { $_->text } @words ); + my $b = join( ' ', map { $_->text } @layerwords ); + if( $a ne $b ) { + $self->uncorrected_path( \@layerwords ); + } + # TODO set self->text +} + +sub _tokenize_text { + my( $self, $teitext, $wordlist, $uncorrlist ) = @_; + # Strip out the words. + my $xpc = _xpc_for_el( $teitext ); + my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' ); + foreach( @divs ) { + my $place_str; + if( my $n = $_->getAttribute( 'n' ) ) { + $place_str = '#DIV_' . $n . '#'; + } else { + $place_str = '#DIV#'; + } + $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str ); + } # foreach
+ + # But maybe we don't have any divs. Just paragraphs. + unless( @divs ) { + $self->_objectify_words( $teitext, $wordlist, $uncorrlist ); + } +} + +sub _objectify_words { + my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_; + + my $xpc = _xpc_for_el( $element ); + my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg}; + my @pgraphs = $xpc->findnodes( $xpexpr ); + return () unless @pgraphs; + # Set up an expression to look for words and segs + $xpexpr = '.' . $tags{w} . '|.' . $tags{seg}; + foreach my $pg ( @pgraphs ) { + # If this paragraph is the descendant of a note element, + # skip it. + my @noop_container = $xpc->findnodes( 'ancestor::note', $pg ); + next if scalar @noop_container; + # Get the text of each node + my $first_word = 1; + # Hunt down each wrapped word/seg, and make an object (or two objects) + # of it, if necessary. + foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) { + my( $text, $uncorr ) = _get_word_object( $c ); +# try { +# ( $text, $uncorr ) = _get_word_object( $c ); +# } catch( Text::Tradition::Error $e +# where { $_->has_tag( 'lb' ) } ) { +# next; +# } + unless( defined $text || defined $uncorr ) { + print STDERR "WARNING: no text in node " . $c->nodeName + . "\n" unless $c->nodeName eq 'lb'; + next; + } + print STDERR "DEBUG: space found in element node " + . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/; + + my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist; + while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) { + $ctr++; + } + my $id = $self->sigil . 'r' . $ctr; + my( $word, $acword ); + if( $text ) { + $word = $self->tradition->collation->add_reading( + { 'id' => $id, 'text' => $text }); + } + if( $uncorr && $uncorr ne $text ) { + $id .= '_ac'; + $acword = $self->tradition->collation->add_reading( + { 'id' => $id, 'text' => $uncorr }); + } elsif( $uncorr ) { + $acword = $word; + } + +# if( $first_word ) { +# $first_word = 0; +# # Set the relevant sectioning markers +# if( $divmarker ) { +# $w->add_placeholder( $divmarker ); +# $divmarker = undef; +# } +# $w->add_placeholder( '#PG#' ); +# } + push( @$wordlist, $word ) if $word; + push( @$uncorrlist, $acword ) if $acword; + } + } +} + +# Given a word or segment node, make a Reading object for the word +# therein. Make two Reading objects if there is an 'uncorrected' vs. +# 'corrected' state. + +sub _get_word_strings { + my( $node ) = @_; + my( $text, $uncorrtext ); + # We can have an lb or pb in the middle of a word; if we do, the + # whitespace (including \n) after the break becomes insignificant + # and we want to nuke it. + my $strip_leading_space = 0; + my $word_excluded = 0; + my $xpc = _xpc_for_el( $node ); + # TODO This does not cope with nested add/dels. + my @addition = $xpc->findnodes( 'ancestor::' . $tags{add} ); + my @deletion = $xpc->findnodes( 'ancestor::' . $tags{del} ); + foreach my $c ($node->childNodes() ) { + if( $c->nodeName eq 'num' + && defined $c->getAttribute( 'value' ) ) { + # Push the number. + $text .= $c->getAttribute( 'value' ) unless @deletion; + $uncorrtext .= $c->getAttribute( 'value' ) unless @addition; + # If this is just after a line/page break, return to normal behavior. + $strip_leading_space = 0; + } elsif ( $c->nodeName =~ /^[lp]b$/ ) { + # Set a flag that strips leading whitespace until we + # get to the next bit of non-whitespace. + $strip_leading_space = 1; + } elsif ( $c->nodeName eq 'fw' # for catchwords + || $c->nodeName eq 'sic' + || $c->nodeName eq 'note' #TODO: decide how to deal with notes + || $c->textContent eq '' + || ref( $c ) eq 'XML::LibXML::Comment' ) { + $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/; + next; + } elsif( $c->nodeName eq 'add' ) { + my( $use, $discard ) = _get_text_from_node( $c ); + $text .= $use; + } elsif( $c->nodeName eq 'del' ) { + my( $discard, $use ) = _get_text_from_node( $c ); + $uncorrtext .= $use; + } else { + my $tagtxt; + if( ref( $c ) eq 'XML::LibXML::Text' ) { + # A text node. + $tagtxt = $c->textContent; + } else { + $tagtxt = _get_text_from_node( $c ); + } + if( $strip_leading_space ) { + $tagtxt =~ s/^[\s\n]+//s; + # Unset the flag as soon as we see non-whitespace. + $strip_leading_space = 0 if $tagtxt; + } + $text .= $tagtxt; + $uncorrtext .= $tagtxt; + } + } + throw( ident => "text not found", + tags => [ $node->nodeName ], + message => "No text found in node " . $node->toString(0) ) + unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/; + return( $text, $uncorrtext ); +} + +sub _split_words { + my( $self, $string, $c ) = @_; + my @raw_words = split( /\s+/, $string ); + my @words; + foreach my $w ( @raw_words ) { + my $id = $self->sigil . 'r'. $c++; + my %opts = ( 'text' => $w, 'id' => $id, 'language' => $self->language ); + my $w_obj = $self->tradition->collation->add_reading( \%opts ); + # Skip any words that have been canonized out of existence. + next if( length( $w_obj->text ) == 0 ); + push( @words, $w_obj ); + } + return @words; +} + +sub _init_from_json { + my( $self ) = shift; + my $wit; + if( $self->has_object ) { + $wit = $self->object; + } else { + + } + + $self->sigil( $wit->{'id'} ); + $self->identifier( $wit->{'name'} ); + my @words; + my @layerwords; + if( exists $wit->{'content'} ) { + # We need to tokenize the text ourselves. + @words = _split_words( $self, $wit->{'content'} ); + } elsif( exists $wit->{'tokens'} ) { + # We have a bunch of pretokenized words. + my $ctr = 0; + foreach my $token ( @{$wit->{'tokens'}} ) { + my $w_obj = $self->tradition->collation->add_reading({ + 'text' => $token, 'id' => $self->sigil . 'r' . $ctr++ }); + push( @words, $w_obj ); + } + ## TODO rethink this JSOn mechanism + if( exists $wit->{'layertokens'} ) { + foreach my $token ( @{$wit->{'layertokens'}} ) { + my $w_obj = $self->tradition->collation->add_reading({ + 'text' => $token, 'id' => $self->sigil . 'r' . $ctr++ }); + push( @layerwords, $w_obj ); } - close WITNESS; - $self->text( \@words ); - } # else the text is in the source string, probably - # XML, and we are doing nothing with it. + } } + # TODO set self->text + $self->path( \@words ); + $self->uncorrected_path( \@layerwords ) if @layerwords; } +sub _init_from_plaintext { + my( $self ) = @_; + my $str; + if( $self->has_file ) { + my $ok = open( INPUT, $self->file ); + unless( $ok ) { + throw( ident => "bad source", + message => 'Could not open ' . $self->file . ' for reading' ); + } + binmode( INPUT, ':encoding(UTF-8)' ); + my @lines = ; + close INPUT; + $str = join( '', @lines ); + } elsif( $self->has_object ) { # ...seriously? + $str = ${$self->object}; + } else { + $str = $self->string; + } + + # TODO allow a different word separation expression + my @text = split( /\s+/, $str ); + $self->text( \@text ); + my @words = _split_words( $self, $str ); + $self->path( \@words ); +} + +sub throw { + Text::Tradition::Error->throw( + 'ident' => 'Witness parsing error', + 'message' => $_[0], + ); +} + +sub _xpc_for_el { + my $el = shift; + my $xpc = XML::LibXML::XPathContext->new( $el ); + if( $el->namespaceURI ) { + $xpc->registerNs( 'tei', $el->namespaceURI ); + } + return $xpc; +} + =head2 export_as_json Exports the witness as a JSON structure, with the following keys: @@ -202,11 +695,13 @@ Exports the witness as a JSON structure, with the following keys: =begin testing use Text::Tradition; +my $trad = Text::Tradition->new(); -my @text = qw( This is a line of text ); -my $wit = Text::Tradition::Witness->new( +my @text = qw/ Thhis is a line of text /; +my $wit = $trad->add_witness( 'sigil' => 'A', - 'text' => \@text, + 'string' => join( ' ', @text ), + 'sourcetype' => 'plaintext', 'identifier' => 'test witness', ); my $jsonstruct = $wit->export_as_json; @@ -219,7 +714,7 @@ foreach my $idx ( 0 .. $#text ) { my @ctext = qw( when april with his showers sweet with fruit the drought of march has pierced unto the root ); -my $trad = Text::Tradition->new( +$trad = Text::Tradition->new( 'input' => 'CollateX', 'file' => 't/data/Collatex-16.xml' ); @@ -231,6 +726,8 @@ foreach my $idx ( 0 .. $#ctext ) { is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" ); } +## TODO test layertext export + =end testing =cut @@ -238,11 +735,16 @@ foreach my $idx ( 0 .. $#ctext ) { sub export_as_json { my $self = shift; my @wordlist = map { { 't' => $_ || '' } } @{$self->text}; - return { + my $obj = { 'id' => $self->sigil, 'tokens' => \@wordlist, 'name' => $self->identifier, }; + if( $self->is_layered ) { + my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected}; + $obj->{'layertokens'} = \@lwlist; + } + return $obj; } no Moose; @@ -252,9 +754,7 @@ __PACKAGE__->meta->make_immutable; =over -=item * Get rid of either text or path, as they are redundant. - -=item * Re-think the mechanism for pre-correction readings etc. +=item * Support encodings other than UTF-8 =back diff --git a/t/data/witnesses/teiwit.xml b/t/data/witnesses/teiwit.xml new file mode 100644 index 0000000..cc30d0c --- /dev/null +++ b/t/data/witnesses/teiwit.xml @@ -0,0 +1,51 @@ + + + + + + + Ժամանակագրութիւն + Մատթէոս Ուռհայեցի + + Transcription by + Tara L Andrews + + + +

Unpublished manuscript

+
+ + + + Venice + Mekhitarist Library + 887 + +

pp. 160-177

+
+
+
+
+ + +

+Արդ՝ մինչև ցայս վայ՛րս բազմաջան և աշխատաւոր +քննութեամբ գտեալ գրեցաք զշարագրական գրեալս զՃից ամաց` զորս ի բազում ժամանակաց հետաքննեալ հասու եղա՞ք։ Ընդ այնքանեաց տեսողացն և լս +ողացն որք էին ի հին ամաց ծնեալք, և ընդ ընթերցողսն յառաջին պատմագր +ացն որք ականատեսք էին լեա՛լք ամենայն եղելոցս, և նեղութեանցս այսոցիկ +զոր վասն մեղաց կրեաց տունն հայո՛ց և ահաւոր զայս բազում անգամ զմտաւ +ածեալ եմ վասն այս յետին ժամանակին գրել զդառնաշունչ կտրծսն։

+

Իսկ ի լինել թվականութեանս հայոց ի յամս ԵՃԲ եղև ահ +աւոր նշան և սոսկալի, և կտրծ մեծ ի բարկութենէ, երևեալ ի մեծ քաղաքն անտիոք, զոր ի ներքոյ արեգականն գործեցաւ՛ այս սքանչելիս որ է՛ր տես +ողացն ահաւոր և հրաշալի. և եղև այս իշխնշանս ա՛հ և դողումն ամենայն հաւատաց +ելոց քրիստոսի, զորս յայնժամ մեծասաստ սպառնալեօք յայտնեաց աստուած զդատաստան +իւր զահաւոր. և իշխ +անն զհաւատսն եթո՛ղ, և եղև հակառակ մեծ ազգին ասորո՛ց. և յայնժամ մատն +եցան ասորիքն ի մեծ նեղութիւն, վասն սկսան հաւատոյ քննութիւն առնե՛լ հանապազօր. +և յայնժամ լրբեալ եղեն ազգն հոռոմո՛ց, որ և զգործեալն ոչ գիտե՛լ, վասն զի զքրիստոս +ական աւետարանն հրով այրե՛լ հրամայեաց պատրիարգն. և եղև յորժամ ի հուրն +դրին զաւետարանն աստուծոյ, ձայն ելանէ՛ր յաւետարանէն անտի և ի կրակէն +ի դուրս ելանէ՛ր.

+ +
+
diff --git a/t/data/witnesses/testwit.json b/t/data/witnesses/testwit.json new file mode 100644 index 0000000..2e69fa7 --- /dev/null +++ b/t/data/witnesses/testwit.json @@ -0,0 +1 @@ +{"witnesses":[{"tokens":[{"n":"Արդ","c":"Արդ՝","punctuation":[{"char":"՝","pos":3}],"t":"Արդ","placeholders":["__P__"]},{"n":"մինչև","c":"մինչև","t":"մինչև"},{"n":"ցայս","c":"ցայս","t":"ցայս"},{"n":"վայրս","c":"վայ՛րս","punctuation":[{"char":"՛","pos":3}],"t":"վայրս"},{"n":"բազմաջան","c":"բազմաջան","t":"բազմաջան"},{"n":"և","c":"և","t":"և"},{"n":"աշխատաւոր","c":"աշխատաւոր","t":"աշխատաւոր"},{"n":"քննութեամբ","c":"քննութեամբ","t":"քննութեամբ"},{"n":"գտեալ","c":"գտեալ","t":"գտեալ"},{"n":"գրեցաք","c":"գրեցաք","t":"գրեցաք"},{"n":"զշարագրական","c":"զշարագրական","t":"զշարագրական"},{"n":"գրեալս","c":"գրեալս","t":"գրեալս"},{"n":"զ100ից","c":"զ100ից","t":"զ100ից"},{"n":"ամաց","c":"ամաց`","punctuation":[{"char":"`","pos":4}],"t":"ամաց"},{"n":"զորս","c":"զորս","t":"զորս"},{"n":"ի","c":"ի","t":"ի"},{"n":"բազում","c":"բազում","t":"բազում"},{"n":"ժամանակաց","c":"ժամանակաց","t":"ժամանակաց"},{"n":"հետաքննեալ","c":"հետաքննեալ","t":"հետաքննեալ"},{"n":"հասու","c":"հասու","t":"հասու"},{"n":"եղաք","c":"եղա՞ք։","punctuation":[{"char":"՞","pos":3},{"char":"։","pos":5}],"t":"եղաք"},{"n":"Ընդ","c":"Ընդ","t":"Ընդ"},{"n":"այնքանեաց","c":"այնքանեաց","t":"այնքանեաց"},{"n":"տեսողացն","c":"տեսողացն","t":"տեսողացն"},{"n":"և","c":"և","t":"և"},{"n":"լսողացն","c":"լսողացն","t":"լսողացն"},{"n":"որք","c":"որք","t":"որք"},{"n":"էին","c":"էին","t":"էին"},{"n":"ի","c":"ի","t":"ի"},{"n":"հին","c":"հին","t":"հին"},{"n":"ամաց","c":"ամաց","t":"ամաց"},{"n":"ծնեալք","c":"ծնեալք,","punctuation":[{"char":",","pos":6}],"t":"ծնեալք"},{"n":"և","c":"և","t":"և"},{"n":"ընդ","c":"ընդ","t":"ընդ"},{"n":"ընթերցողսն","c":"ընթերցողսն","t":"ընթերցողսն"},{"n":"յառաջին","c":"յառաջին","t":"յառաջին"},{"n":"պատմագրացն","c":"պատմագրացն","t":"պատմագրացն"},{"n":"որք","c":"որք","t":"որք"},{"n":"ականատեսք","c":"ականատեսք","t":"ականատեսք"},{"n":"էին","c":"էին","t":"էին"},{"n":"լեալք","c":"լեա՛լք","punctuation":[{"char":"՛","pos":3}],"t":"լեալք"},{"n":"ամենայն","c":"ամենայն","t":"ամենայն"},{"n":"եղելոցս","c":"եղելոցս,","punctuation":[{"char":",","pos":7}],"t":"եղելոցս"},{"n":"և","c":"և","t":"և"},{"n":"նեղութեանցս","c":"նեղութեանցս","t":"նեղութեանցս"},{"n":"այսոցիկ","c":"այսոցիկ","t":"այսոցիկ"},{"n":"զոր","c":"զոր","t":"զոր"},{"n":"վասն","c":"վասն","t":"վասն"},{"n":"մեղաց","c":"մեղաց","t":"մեղաց"},{"n":"կրեաց","c":"կրեաց","t":"կրեաց"},{"n":"տունն","c":"տունն","t":"տունն"},{"n":"հայոց","c":"հայո՛ց","punctuation":[{"char":"՛","pos":4}],"t":"հայոց"},{"n":"և","c":"և","t":"և"},{"n":"ահաւոր","c":"ահաւոր","t":"ահաւոր"},{"n":"զայս","c":"զայս","t":"զայս"},{"n":"բազում","c":"բազում","t":"բազում"},{"n":"անգամ","c":"անգամ","t":"անգամ"},{"n":"զմտաւ","c":"զմտաւ","t":"զմտաւ"},{"n":"ածեալ","c":"ածեալ","t":"ածեալ"},{"n":"եմ","c":"եմ","t":"եմ"},{"n":"վասն","c":"վասն","t":"վասն"},{"n":"այս","c":"այս","t":"այս"},{"n":"յետին","c":"յետին","t":"յետին"},{"n":"ժամանակին","c":"ժամանակին","t":"ժամանակին"},{"n":"գրել","c":"գրել","t":"գրել"},{"n":"զդառնաշունչ","c":"զդառնաշունչ","t":"զդառնաշունչ"},{"n":"կտրծսն","c":"կտրծսն։","punctuation":[{"char":"։","pos":6}],"t":"կտրծսն"},{"n":"Իսկ","c":"Իսկ","t":"Իսկ","placeholders":["__P__"]},{"n":"ի","c":"ի","t":"ի"},{"n":"լինել","c":"լինել","t":"լինել"},{"n":"թվականութեանս","c":"թվականութեանս","t":"թվականութեանս"},{"n":"հայոց","c":"հայոց","t":"հայոց"},{"n":"ի","c":"ի","t":"ի"},{"n":"յամս","c":"յամս","t":"յամս"},{"n":"502","c":"502","t":"502"},{"n":"եղև","c":"եղև","t":"եղև"},{"n":"ահաւոր","c":"ահաւոր","t":"ահաւոր"},{"n":"նշան","c":"նշան","t":"նշան"},{"n":"և","c":"և","t":"և"},{"n":"սոսկալի","c":"սոսկալի,","punctuation":[{"char":",","pos":7}],"t":"սոսկալի"},{"n":"և","c":"և","t":"և"},{"n":"կտրծ","c":"կտրծ","t":"կտրծ"},{"n":"մեծ","c":"մեծ","t":"մեծ"},{"n":"ի","c":"ի","t":"ի"},{"n":"բարկութենէ","c":"բարկութենէ,","punctuation":[{"char":",","pos":10}],"t":"բարկութենէ"},{"n":"երևեալ","c":"երևեալ","t":"երևեալ"},{"n":"ի","c":"ի","t":"ի"},{"n":"մեծ","c":"մեծ","t":"մեծ"},{"n":"քաղաքն","c":"քաղաքն","t":"քաղաքն"},{"n":"անտիոք","c":"անտիոք,","punctuation":[{"char":",","pos":6}],"t":"անտիոք"},{"n":"զոր","c":"զոր","t":"զոր"},{"n":"ի","c":"ի","t":"ի"},{"n":"ներքոյ","c":"ներքոյ","t":"ներքոյ"},{"n":"արեգականն","c":"արեգականն","t":"արեգականն"},{"n":"գործեցաւ","c":"գործեցաւ՛","punctuation":[{"char":"՛","pos":8}],"t":"գործեցաւ"},{"n":"այս","c":"այս","t":"այս"},{"n":"սքանչելիս","c":"սքանչելիս","t":"սքանչելիս"},{"n":"որ","c":"որ","t":"որ"},{"n":"էր","c":"է՛ր","punctuation":[{"char":"՛","pos":1}],"t":"էր"},{"n":"տեսողացն","c":"տեսողացն","t":"տեսողացն"},{"n":"ահաւոր","c":"ահաւոր","t":"ահաւոր"},{"n":"և","c":"և","t":"և"},{"n":"հրաշալի","c":"հրաշալի.","punctuation":[{"char":".","pos":7}],"t":"հրաշալի"},{"n":"և","c":"և","t":"և"},{"n":"եղև","c":"եղև","t":"եղև"},{"n":"այս","c":"այս","t":"այս"},{"n":"նշանս","c":"նշանս","t":"նշանս"},{"n":"ահ","c":"ա՛հ","punctuation":[{"char":"՛","pos":1}],"t":"ահ"},{"n":"և","c":"և","t":"և"},{"n":"դողումն","c":"դողումն","t":"դողումն"},{"n":"ամենայն","c":"ամենայն","t":"ամենայն"},{"n":"հաւատացելոց","c":"հաւատացելոց","t":"հաւատացելոց"},{"n":"քրիստոսի","c":"քրիստոսի,","punctuation":[{"char":",","pos":8}],"t":"քրիստոսի"},{"n":"զորս","c":"զորս","t":"զորս"},{"n":"այժմ","c":"այժմ","t":"այժմ"},{"n":"մեծասաստ","c":"մեծասաստ","t":"մեծասաստ"},{"n":"սպառնալեօք","c":"սպառնալեօք","t":"սպառնալեօք"},{"n":"յայտնեաց","c":"յայտնեաց","t":"յայտնեաց"},{"n":"աստուած","c":"աստուած","t":"աստուած"},{"n":"զդատաստան","c":"զդատաստան","t":"զդատաստան"},{"n":"իւր","c":"իւր","t":"իւր"},{"n":"զահաւոր","c":"զահաւոր.","punctuation":[{"char":".","pos":7}],"t":"զահաւոր"},{"n":"և","c":"և","t":"և"},{"n":"իշխանն","c":"իշխանն","t":"իշխանն"},{"n":"զհաւատսն","c":"զհաւատսն","t":"զհաւատսն"},{"n":"եթող","c":"եթո՛ղ,","punctuation":[{"char":"՛","pos":3},{"char":",","pos":5}],"t":"եթող"},{"n":"և","c":"և","t":"և"},{"n":"եղև","c":"եղև","t":"եղև"},{"n":"հակառակ","c":"հակառակ","t":"հակառակ"},{"n":"մեծ","c":"մեծ","t":"մեծ"},{"n":"ազգին","c":"ազգին","t":"ազգին"},{"n":"ասորոց","c":"ասորո՛ց.","punctuation":[{"char":"՛","pos":5},{"char":".","pos":7}],"t":"ասորոց"},{"n":"և","c":"և","t":"և"},{"n":"յայնժամ","c":"յայնժամ","t":"յայնժամ"},{"n":"մատնեցան","c":"մատնեցան","t":"մատնեցան"},{"n":"ասորիքն","c":"ասորիքն","t":"ասորիքն"},{"n":"ի","c":"ի","t":"ի"},{"n":"մեծ","c":"մեծ","t":"մեծ"},{"n":"նեղութիւն","c":"նեղութիւն,","punctuation":[{"char":",","pos":9}],"t":"նեղութիւն"},{"n":"վասն","c":"վասն","t":"վասն"},{"n":"սկսան","c":"սկսան","t":"սկսան"},{"n":"հաւատոյ","c":"հաւատոյ","t":"հաւատոյ"},{"n":"քննութիւն","c":"քննութիւն","t":"քննութիւն"},{"n":"առնել","c":"առնե՛լ","punctuation":[{"char":"՛","pos":4}],"t":"առնել"},{"n":"հանապազօր","c":"հանապազօր.","punctuation":[{"char":".","pos":9}],"t":"հանապազօր"},{"n":"և","c":"և","t":"և"},{"n":"յայնժամ","c":"յայնժամ","t":"յայնժամ"},{"n":"լրբեալ","c":"լրբեալ","t":"լրբեալ"},{"n":"եղեն","c":"եղեն","t":"եղեն"},{"n":"ազգն","c":"ազգն","t":"ազգն"},{"n":"հոռոմոց","c":"հոռոմո՛ց,","punctuation":[{"char":"՛","pos":6},{"char":",","pos":8}],"t":"հոռոմոց"},{"n":"որ","c":"որ","t":"որ"},{"n":"և","c":"և","t":"և"},{"n":"զգործեալն","c":"զգործեալն","t":"զգործեալն"},{"n":"ոչ","c":"ոչ","t":"ոչ"},{"n":"գիտել","c":"գիտե՛լ,","punctuation":[{"char":"՛","pos":4},{"char":",","pos":6}],"t":"գիտել"},{"n":"վասն","c":"վասն","t":"վասն"},{"n":"զի","c":"զի","t":"զի"},{"n":"զքրիստոսական","c":"զքրիստոսական","t":"զքրիստոսական"},{"n":"աւետարանն","c":"աւետարանն","t":"աւետարանն"},{"n":"հրով","c":"հրով","t":"հրով"},{"n":"այրել","c":"այրե՛լ","punctuation":[{"char":"՛","pos":4}],"t":"այրել"},{"n":"հրամայեաց","c":"հրամայեաց","t":"հրամայեաց"},{"n":"պատրիարգն","c":"պատրիարգն.","punctuation":[{"char":".","pos":9}],"t":"պատրիարգն"},{"n":"և","c":"և","t":"և"},{"n":"եղև","c":"եղև","t":"եղև"},{"n":"յորժամ","c":"յորժամ","t":"յորժամ"},{"n":"ի","c":"ի","t":"ի"},{"n":"հուրն","c":"հուրն","t":"հուրն"},{"n":"դրին","c":"դրին","t":"դրին"},{"n":"զաւետարանն","c":"զաւետարանն","t":"զաւետարանն"},{"n":"աստուծոյ","c":"աստուծոյ,","punctuation":[{"char":",","pos":8}],"t":"աստուծոյ"},{"n":"ձայն","c":"ձայն","t":"ձայն"},{"n":"ելանէր","c":"ելանէ՛ր","punctuation":[{"char":"՛","pos":5}],"t":"ելանէր"},{"n":"յաւետարանէն","c":"յաւետարանէն","t":"յաւետարանէն"},{"n":"անտի","c":"անտի","t":"անտի"},{"n":"և","c":"և","t":"և"},{"n":"ի","c":"ի","t":"ի"},{"n":"կրակէն","c":"կրակէն","t":"կրակէն"},{"n":"ի","c":"ի","t":"ի"},{"n":"դուրս","c":"դուրս","t":"դուրս"},{"n":"ելանէր","c":"ելանէ՛ր.","punctuation":[{"char":"՛","pos":5},{"char":".","pos":7}],"t":"ելանէր"}],"id":"MsAJ","name":"JSON 1"},{"tokens":[{"n":"Արդ","c":"Արդ","t":"Արդ","placeholders":["__DIV__","__P__"]},{"n":"ահա","c":"ահա՛","punctuation":[{"char":"՛","pos":3}],"t":"ահա"},{"n":"մինչև","c":"մինչև","t":"մինչև"},{"n":"ցայս","c":"ցայս","t":"ցայս"},{"n":"վարս","c":"վարս","t":"վարս"},{"n":"բազմաջան","c":"բազմա՛ջան","punctuation":[{"char":"՛","pos":5}],"t":"բազմաջան"},{"n":"և","c":"և","t":"և"},{"n":"աշխատաւոր","c":"աշխատաւոր","t":"աշխատաւոր"},{"n":"քննութեամբ","c":"քննութեամբ","t":"քննութեամբ"},{"n":"գտեալ","c":"գտեալ","t":"գտեալ"},{"n":"գրեցաք","c":"գրեցաք","t":"գրեցաք"},{"n":"զշարագրական","c":"զշարագրական","t":"զշարագրական"},{"n":"գրեալս","c":"գրեալս,","punctuation":[{"char":",","pos":6}],"t":"գրեալս"},{"n":"զ100ից","c":"զ100ից","t":"զ100ից"},{"n":"ամաց","c":"ամաց,","punctuation":[{"char":",","pos":4}],"t":"ամաց"},{"n":"զոր","c":"զոր","t":"զոր"},{"n":"ի","c":"ի","t":"ի"},{"n":"բազում","c":"բազում","t":"բազում"},{"n":"ժամանակաց","c":"ժամանակաց","t":"ժամանակաց"},{"n":"հետա","c":"հետա՛","punctuation":[{"char":"՛","pos":4}],"t":"հետա"},{"n":"հասու","c":"հասու","t":"հասու"},{"n":"եղաք","c":"եղաք","t":"եղաք"},{"n":"ըստ","c":"ըստ","t":"ըստ"},{"n":"այնքանեացս","c":"այնքանեացս","t":"այնքանեացս"},{"n":"տեսողացն","c":"տեսողացն","t":"տեսողացն"},{"n":"և","c":"և","t":"և"},{"n":"լսողացն","c":"լսողացն,","punctuation":[{"char":",","pos":7}],"t":"լսողացն"},{"n":"որք","c":"որք","t":"որք"},{"n":"էին","c":"էին","t":"էին"},{"n":"ի","c":"ի","t":"ի"},{"n":"հին","c":"հին","t":"հին"},{"n":"ամաց","c":"ամաց","t":"ամաց"},{"n":"ծնեալք","c":"ծնեալք,","punctuation":[{"char":",","pos":6}],"t":"ծնեալք"},{"n":"և","c":"և","t":"և"},{"n":"ընթերցողսն","c":"ընթերցողսն","t":"ընթերցողսն"},{"n":"առաջին","c":"առաջին","t":"առաջին"},{"n":"պատգամագրացն","c":"պատգամագրացն","t":"պատգամագրացն"},{"n":"որք","c":"որք","t":"որք"},{"n":"ականատեսք","c":"ականա՛տեսք","punctuation":[{"char":"՛","pos":5}],"t":"ականատեսք"},{"n":"էին","c":"էին","t":"էին"},{"n":"լեալք","c":"լեալք","t":"լեալք"},{"n":"ամենայն","c":"ամենայն","t":"ամենայն"},{"n":"եղելոցս","c":"եղելոցս,","punctuation":[{"char":",","pos":7}],"t":"եղելոցս"},{"n":"և","c":"և","t":"և"},{"n":"նեղութեանցս","c":"նեղութեանցս,","punctuation":[{"char":",","pos":11}],"t":"նեղութեանցս"},{"n":"յայսոցիկ","c":"յայսոցիկ","t":"յայսոցիկ"},{"n":"զոր","c":"զոր","t":"զոր"},{"n":"վասն","c":"վասն","t":"վասն"},{"n":"մեղաց","c":"մեղաց","t":"մեղաց"},{"n":"տուն","c":"տուն","t":"տուն"},{"n":"հաոց","c":"հա՛ոց,","punctuation":[{"char":"՛","pos":2},{"char":",","pos":5}],"t":"հաոց"},{"n":"և","c":"և","t":"և"},{"n":"ահա","c":"ահա՛","punctuation":[{"char":"՛","pos":3}],"t":"ահա"},{"n":"որ","c":"որ","t":"որ"},{"n":"զայս","c":"զայս","t":"զայս"},{"n":"բազում","c":"բազում","t":"բազում"},{"n":"անգամ","c":"անգամ","t":"անգամ"},{"n":"զմտաւ","c":"զմտաւ","t":"զմտաւ"},{"n":"ածեալ","c":"ածեալ","t":"ածեալ"},{"n":"իմ","c":"իմ,","punctuation":[{"char":",","pos":2}],"t":"իմ"},{"n":"վասն","c":"վասն","t":"վասն"},{"n":"այս","c":"այս","t":"այս"},{"n":"յետին","c":"յետին","t":"յետին"},{"n":"ժամանակիս","c":"ժամանակիս","t":"ժամանակիս"},{"n":"գրել","c":"գրել","t":"գրել"},{"n":"զդառնաշունչ","c":"զդառնա՛շունչ","punctuation":[{"char":"՛","pos":6}],"t":"զդառնաշունչ"},{"n":"կտծսն","c":"կտծսն։","punctuation":[{"char":"։","pos":5}],"t":"կտծսն"},{"n":"Իսկ","c":"Իսկ","t":"Իսկ","placeholders":["__P__"]},{"n":"ի","c":"ի","t":"ի"},{"n":"լինել","c":"լինել","t":"լինել"},{"n":"թվականութեանս","c":"թվականութեանս","t":"թվականութեանս"},{"n":"հաոց","c":"հա՛ոց","punctuation":[{"char":"՛","pos":2}],"t":"հաոց"},{"n":"ի","c":"ի","t":"ի"},{"n":"յամս","c":"յամս","t":"յամս"},{"n":"500","c":"500","t":"500"},{"n":"և","c":"և","t":"և"},{"n":"2","c":"2","t":"2"},{"n":"եղև","c":"եղև","t":"եղև"},{"n":"ահաւոր","c":"ահաւոր","t":"ահաւոր"},{"n":"նշան","c":"նշան","t":"նշան"},{"n":"և","c":"և","t":"և"},{"n":"սոսկալի","c":"սոսկալի","t":"սոսկալի"},{"n":"և","c":"և","t":"և"},{"n":"կտրծ","c":"կտրծ","t":"կտրծ"},{"n":"մեծի","c":"մեծի","t":"մեծի"},{"n":"բարկութեան","c":"բարկութեան","t":"բարկութեան"},{"n":"երևեալ","c":"երևեալ","t":"երևեալ"},{"n":"ի","c":"ի","t":"ի"},{"n":"մեծ","c":"մեծ","t":"մեծ"},{"n":"քաղաքն","c":"քաղաքն","t":"քաղաքն"},{"n":"անտիոք","c":"անտիոք,","punctuation":[{"char":",","pos":6}],"t":"անտիոք"},{"n":"զոր","c":"զոր","t":"զոր"},{"n":"ի","c":"ի","t":"ի"},{"n":"ներքո","c":"ներքո՛","punctuation":[{"char":"՛","pos":5}],"t":"ներքո"},{"n":"արեգականս","c":"արեգականս","t":"արեգականս"},{"n":"գործեցաւ","c":"գործեցաւ,","punctuation":[{"char":",","pos":8}],"t":"գործեցաւ"},{"n":"այս","c":"այս","t":"այս"},{"n":"սքանչելիս","c":"սքանչելիս,","punctuation":[{"char":",","pos":9}],"t":"սքանչելիս"},{"n":"որ","c":"որ","t":"որ"},{"n":"էր","c":"էր","t":"էր"},{"n":"տեսանողացն","c":"տեսանողացն","t":"տեսանողացն"},{"n":"ահաւոր","c":"ահաւոր","t":"ահաւոր"},{"n":"և","c":"և","t":"և"},{"n":"հրաշալի","c":"հրաշալի.","punctuation":[{"char":".","pos":7}],"t":"հրաշալի"},{"n":"և","c":"և","t":"և"},{"n":"եղև","c":"եղև","t":"եղև"},{"n":"այս","c":"այս","t":"այս"},{"n":"նշանս","c":"նշանս","t":"նշանս"},{"n":"ահ","c":"ա՛հ","punctuation":[{"char":"՛","pos":1}],"t":"ահ"},{"n":"և","c":"և","t":"և"},{"n":"դողումս","c":"դողումս","t":"դողումս"},{"n":"ամենայն","c":"ամենայն","t":"ամենայն"},{"n":"հաւատացելոց","c":"հաւատացելոց","t":"հաւատացելոց"},{"n":"քրիստոսի","c":"քրիստոսի,","punctuation":[{"char":",","pos":8}],"t":"քրիստոսի"},{"n":"զորս","c":"զորս","t":"զորս"},{"n":"այժմ","c":"այժմ","t":"այժմ"},{"n":"մեծասաստ","c":"մեծա՛սաստ","punctuation":[{"char":"՛","pos":4}],"t":"մեծասաստ"},{"n":"սպառնալեօք","c":"սպառնալեօք","t":"սպառնալեօք"},{"n":"յայտնեաց","c":"յայտնեաց","t":"յայտնեաց"},{"n":"աստուած","c":"աստուած","t":"աստուած"},{"n":"զդատաստանսն","c":"զդատաստանսն","t":"զդատաստանսն"},{"n":"ահաւոր","c":"ահաւոր","t":"ահաւոր"},{"n":"և","c":"և","t":"և"},{"n":"եղև","c":"եղև","t":"եղև"},{"n":"պատճառ","c":"պատճառ","t":"պատճառ"},{"n":"բարկութեանն","c":"բարկութեանն","t":"բարկութեանն"},{"n":"այսպէս","c":"այսպէս.","punctuation":[{"char":".","pos":6}],"t":"այսպէս"},{"n":"և","c":"և","t":"և"},{"n":"արդ","c":"արդ","t":"արդ"},{"n":"վասն","c":"վասն","t":"վասն"},{"n":"զի","c":"զի","t":"զի"},{"n":"ազգն","c":"ազգն","t":"ազգն"},{"n":"ասորոց","c":"ասորոց","t":"ասորոց"},{"n":"բազում","c":"բազում","t":"բազում"},{"n":"էին","c":"էին","t":"էին"},{"n":"ի","c":"ի","t":"ի"},{"n":"քաղաքն","c":"քաղաքն","t":"քաղաքն"},{"n":"անտիոք","c":"անտիոք,","punctuation":[{"char":",","pos":6}],"t":"անտիոք"},{"n":"ոսկով","c":"ոսկով","t":"ոսկով"},{"n":"և","c":"և","t":"և"},{"n":"արծաթով","c":"արծաթով,","punctuation":[{"char":",","pos":7}],"t":"արծաթով"},{"n":"լցեալ","c":"լցեալ","t":"լցեալ"},{"n":"հարստտութեամբ","c":"հարստտութեամբ","t":"հարստտութեամբ"},{"n":"և","c":"և","t":"և"},{"n":"ամենայն","c":"ամենայն","t":"ամենայն"},{"n":"փառաւորութեամբ","c":"փառաւորութեամբ,","punctuation":[{"char":",","pos":14}],"t":"փառաւորութեամբ"},{"n":"և","c":"և","t":"և"},{"n":"մանկունք","c":"մանկունք","t":"մանկունք"},{"n":"նոցա","c":"նոցա","t":"նոցա"},{"n":"յորժամ","c":"յորժամ","t":"յորժամ"},{"n":"երթաին","c":"երթա՛ին","punctuation":[{"char":"՛","pos":4}],"t":"երթաին"},{"n":"յեկեղեցին","c":"յեկեղեցին","t":"յեկեղեցին"},{"n":"իւրոց","c":"իւրոց,","punctuation":[{"char":",","pos":5}],"t":"իւրոց"},{"n":"տղայք","c":"տղայք","t":"տղայք"},{"n":"500","c":"500","t":"500"},{"n":"ի","c":"ի","t":"ի"},{"n":"ջորոջ","c":"ջորոջ","t":"ջորոջ"},{"n":"նստեալ","c":"նստեալ","t":"նստեալ"},{"n":"գաին","c":"գա՛ին,","punctuation":[{"char":"՛","pos":2},{"char":",","pos":5}],"t":"գաին"},{"n":"և","c":"և","t":"և"},{"n":"ազգն","c":"ազգն","t":"ազգն"},{"n":"հոռոմոց","c":"հոռոմոց","t":"հոռոմոց"},{"n":"ոխացեալ","c":"ոխացեալ","t":"ոխացեալ"},{"n":"էին","c":"էին","t":"էին"},{"n":"յոյժ","c":"յոյժ","t":"յոյժ"},{"n":"մեծօ","c":"մեծօ՛","punctuation":[{"char":"՛","pos":4}],"t":"մեծօ"},{"n":"չարութեամբ","c":"չարութեամբ.","punctuation":[{"char":".","pos":10}],"t":"չարութեամբ"},{"n":"և","c":"և","t":"և"},{"n":"մի","c":"մի","t":"մի"},{"n":"ոմն","c":"ոմն","t":"ոմն"},{"n":"իշխան","c":"իշխան","t":"իշխան"},{"n":"ասորոց","c":"ասորոց","t":"ասորոց"},{"n":"ազգէն","c":"ազգէն","t":"ազգէն"},{"n":"ունէր","c":"ունէր","t":"ունէր"},{"n":"կապեալ","c":"կապեալ","t":"կապեալ"},{"n":"բազում","c":"բազում","t":"բազում"},{"n":"և","c":"և","t":"և"},{"n":"յաղագս","c":"յաղագս","t":"յաղագս"},{"n":"այսր","c":"այսր","t":"այսր"},{"n":"պատճառանաց","c":"պատճառանաց","t":"պատճառանաց"},{"n":"լինէր","c":"լինէր","t":"լինէր"},{"n":"մեծ","c":"մեծ","t":"մեծ"},{"n":"դատաստան","c":"դատաստան","t":"դատաստան"},{"n":"առաջի","c":"առաջի","t":"առաջի"},{"n":"պատրիարգին","c":"պատրիարգին","t":"պատրիարգին"},{"n":"հոռոմոց","c":"հոռոմոց.","punctuation":[{"char":".","pos":7}],"t":"հոռոմոց"},{"n":"և","c":"և","t":"և"},{"n":"սիրաբար","c":"սիրա՛բար","punctuation":[{"char":"՛","pos":4}],"t":"սիրաբար"},{"n":"վասն","c":"վասն","t":"վասն"},{"n":"դատաստանին","c":"դատաստանին,","punctuation":[{"char":",","pos":10}],"t":"դատաստանին"},{"n":"դարձուցին","c":"դարձուցին","t":"դարձուցին"},{"n":"զնա","c":"զնա՛","punctuation":[{"char":"՛","pos":3}],"t":"զնա"},{"n":"ի","c":"ի","t":"ի"},{"n":"հաւատոյն","c":"հաւատոյն","t":"հաւատոյն"},{"n":"իւրո","c":"իւրո՛,","punctuation":[{"char":"՛","pos":4},{"char":",","pos":5}],"t":"իւրո"},{"n":"և","c":"և","t":"և"},{"n":"կնքեցին","c":"կնքեցին","t":"կնքեցին"},{"n":"զնա","c":"զնա՛","punctuation":[{"char":"՛","pos":3}],"t":"զնա"},{"n":"հոռոմ","c":"հոռոմ","t":"հոռոմ"},{"n":"իւր","c":"իւր","t":"իւր"},{"n":"կամաւորութեամբ","c":"կամաւորութեամբ,","punctuation":[{"char":",","pos":14}],"t":"կամաւորութեամբ"},{"n":"և","c":"և","t":"և"},{"n":"իշխանն","c":"իշխանն","t":"իշխանն"},{"n":"որ","c":"որ","t":"որ"},{"n":"զհաւատսն","c":"զհաւատսն","t":"զհաւատսն"},{"n":"եթող","c":"եթող,","punctuation":[{"char":",","pos":4}],"t":"եթող"},{"n":"եղև","c":"եղև","t":"եղև"},{"n":"հակառակ","c":"հակառակ","t":"հակառակ"},{"n":"մեծ","c":"մեծ","t":"մեծ"},{"n":"ազգին","c":"ազգին","t":"ազգին"},{"n":"ասորոց","c":"ասորոց.","punctuation":[{"char":".","pos":6}],"t":"ասորոց"},{"n":"և","c":"և","t":"և"},{"n":"յայնժամ","c":"յայնժամ","t":"յայնժամ"},{"n":"մատնեցան","c":"մատնեցան","t":"մատնեցան"},{"n":"ասորիք","c":"ասորիք","t":"ասորիք"},{"n":"ի","c":"ի","t":"ի"},{"n":"մեծ","c":"մեծ","t":"մեծ"},{"n":"նեղութիւն","c":"նեղութիւն,","punctuation":[{"char":",","pos":9}],"t":"նեղութիւն"},{"n":"վասն","c":"վասն","t":"վասն"},{"n":"զի","c":"զի","t":"զի"},{"n":"սկսան","c":"սկսան","t":"սկսան"},{"n":"հաւատո","c":"հաւատո՛","punctuation":[{"char":"՛","pos":6}],"t":"հաւատո"},{"n":"քննութիւն","c":"քննութիւն","t":"քննութիւն"},{"n":"առնել","c":"առնել","t":"առնել"},{"n":"հանապազօր","c":"հանապազօր.","punctuation":[{"char":".","pos":9}],"t":"հանապազօր"},{"n":"և","c":"և","t":"և"},{"n":"այնչափ","c":"այնչափ","t":"այնչափ"},{"n":"լրբեացալ","c":"լրբեացալ","t":"լրբեացալ"},{"n":"եղեն","c":"եղեն","t":"եղեն"},{"n":"ազգն","c":"ազգն","t":"ազգն"},{"n":"հոռոմոց","c":"հոռոմոց,","punctuation":[{"char":",","pos":7}],"t":"հոռոմոց"},{"n":"որ","c":"որ","t":"որ"},{"n":"և","c":"և","t":"և"},{"n":"զգործեալն","c":"զգործեալն","t":"զգործեալն"},{"n":"ոչ","c":"ոչ","t":"ոչ"},{"n":"կարացին","c":"կարացին","t":"կարացին"},{"n":"գիտել","c":"գիտել,","punctuation":[{"char":",","pos":5}],"t":"գիտել"},{"n":"վասն","c":"վասն","t":"վասն"},{"n":"զի","c":"զի","t":"զի"},{"n":"քրիստոսական","c":"քրիստոսական","t":"քրիստոսական"},{"n":"աւետարանն","c":"աւետարանն","t":"աւետարանն"},{"n":"հրով","c":"հրով","t":"հրով"},{"n":"այրել","c":"այրել","t":"այրել"},{"n":"հրամայեաց","c":"հրամայեաց","t":"հրամայեաց"},{"n":"պատրիարգն","c":"պատրիարգն.","punctuation":[{"char":".","pos":9}],"t":"պատրիարգն"},{"n":"և","c":"և","t":"և"},{"n":"եղև","c":"եղև","t":"եղև"},{"n":"յորժամ","c":"յորժամ","t":"յորժամ"},{"n":"ի","c":"ի","t":"ի"},{"n":"հուրն","c":"հուրն","t":"հուրն"},{"n":"դրին","c":"դրին","t":"դրին"},{"n":"զաւետարանն","c":"զաւետարանն","t":"զաւետարանն"},{"n":"աստուծոյ","c":"աստուծոյ","t":"աստուծոյ"},{"n":"ձայն","c":"ձայն","t":"ձայն"},{"n":"ելանէր","c":"ելանէր","t":"ելանէր"},{"n":"յաւետարանէ","c":"յաւետարանէ","t":"յաւետարանէ"},{"n":"անտի","c":"անտի,","punctuation":[{"char":",","pos":4}],"t":"անտի"},{"n":"և","c":"և","t":"և"},{"n":"ի","c":"ի","t":"ի"},{"n":"կրակէն","c":"կրակէն","t":"կրակէն"},{"n":"ի","c":"ի","t":"ի"},{"n":"դուրս","c":"դուրս","t":"դուրս"},{"n":"ելանէր","c":"ելանէր.","punctuation":[{"char":".","pos":6}],"t":"ելանէր"}],"id":"MsBJ","name":"JSON 2"}]} \ No newline at end of file diff --git a/t/text_tradition_witness.t b/t/text_tradition_witness.t index 3a2518c..d2406e7 100644 --- a/t/text_tradition_witness.t +++ b/t/text_tradition_witness.t @@ -8,18 +8,46 @@ $| = 1; # =begin testing { -use_ok( 'Text::Tradition::Witness', "can use module" ); +use Text::Tradition; +my $trad = Text::Tradition->new( 'name' => 'test tradition' ); +my $c = $trad->collation; -my @text = qw( This is a line of text ); -my $wit = Text::Tradition::Witness->new( +# Test a plaintext witness via string +my $str = 'This is a line of text'; +my $ptwit = $trad->add_witness( 'sigil' => 'A', - 'text' => \@text, + 'sourcetype' => 'plaintext', + 'string' => $str ); -is( ref( $wit ), 'Text::Tradition::Witness', 'Created a witness' ); -if( $wit ) { - is( $wit->sigil, 'A', "Witness has correct sigil" ); - is( join( ' ', @{$wit->text} ), join( ' ', @text ), "Witness has correct text" ); +is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' ); +if( $ptwit ) { + is( $ptwit->sigil, 'A', "Witness has correct sigil" ); + is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" ); } + +# # Test some JSON witnesses via object +# open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input"; +# binmode( JSIN, ':encoding(UTF-8)' ); +# my @lines = ; +# close JSIN; +# $trad->add_json_witnesses( join( '', @lines ) ); +# is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness', +# "Found first JSON witness" ); +# is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness', +# "Found second JSON witness" ); +# +# # Test an XML witness via file +# my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc', +# 'file' => 't/data/witnesses/teiwit.xml' ); +# is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" ); +# if( $xmlwit ) { +# is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" ); +# ok( $xmlwit->is_layered, "Picked up correction layer" ); +# is( @{$xmlwit->path}, 185, "Got correct text length" ); +# is( @{$xmlwit->uncorrected_path}, 185, "Got correct a.c. text length" ); +# } + +## Test use_text } @@ -27,11 +55,13 @@ if( $wit ) { # =begin testing { use Text::Tradition; +my $trad = Text::Tradition->new(); -my @text = qw( This is a line of text ); -my $wit = Text::Tradition::Witness->new( +my @text = qw/ Thhis is a line of text /; +my $wit = $trad->add_witness( 'sigil' => 'A', - 'text' => \@text, + 'string' => join( ' ', @text ), + 'sourcetype' => 'plaintext', 'identifier' => 'test witness', ); my $jsonstruct = $wit->export_as_json; @@ -44,7 +74,7 @@ foreach my $idx ( 0 .. $#text ) { my @ctext = qw( when april with his showers sweet with fruit the drought of march has pierced unto the root ); -my $trad = Text::Tradition->new( +$trad = Text::Tradition->new( 'input' => 'CollateX', 'file' => 't/data/Collatex-16.xml' ); @@ -55,6 +85,8 @@ is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" ); foreach my $idx ( 0 .. $#ctext ) { is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" ); } + +## TODO test layertext export }