From: Tara L Andrews Date: Mon, 16 Apr 2012 09:27:27 +0000 (+0200) Subject: make XML witness parsing work X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=b39fb0b309212f41a85d6b4bb90af13aa03095c3;p=scpubgit%2Fstemmatology.git make XML witness parsing work --- diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm index d7047ba..e9321f4 100644 --- a/lib/Text/Tradition/Witness.pm +++ b/lib/Text/Tradition/Witness.pm @@ -154,18 +154,30 @@ is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness', is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness', "Found second JSON witness" ); -# # Test an XML witness via file -# my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc', -# 'file' => 't/data/witnesses/teiwit.xml' ); -# is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" ); -# if( $xmlwit ) { -# is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" ); -# ok( $xmlwit->is_layered, "Picked up correction layer" ); -# is( @{$xmlwit->path}, 185, "Got correct text length" ); -# is( @{$xmlwit->uncorrected_path}, 185, "Got correct a.c. text length" ); -# } +# Test an XML witness via file +my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc', + 'file' => 't/data/witnesses/teiwit.xml' ); +is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" ); +if( $xmlwit ) { + is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" ); + ok( $xmlwit->is_layered, "Picked up correction layer" ); + is( @{$xmlwit->text}, 182, "Got correct text length" ); + is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" ); +} +my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings; +is( @allwitwords, 184, "Reused appropriate readings" ); ## Test use_text +my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc', + 'file' => 't/data/witnesses/group.xml', + 'use_text' => '//tei:group/tei:text[2]' ); +is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" ); +if( $xpwit ) { + is( $xpwit->sigil, 'G', "XML part witness has correct sigil" ); + ok( !$xpwit->is_layered, "Picked up no correction layer" ); + is( @{$xpwit->text}, 157, "Got correct text length" ); +} + =end testing @@ -198,6 +210,12 @@ has 'sigil' => ( writer => '_set_sigil', ); +has 'language' => ( + is => 'ro', + isa => 'Str', + default => 'Default', + ); + # Other identifying information has 'identifier' => ( is => 'rw', @@ -219,20 +237,14 @@ has 'idno' => ( isa => 'Str', ); +# Source. Can be XML obj, JSON data struct, or string. +# Not used if the witness is created by parsing a collation. has 'sourcetype' => ( is => 'ro', isa => 'SourceType', required => 1, ); -has 'language' => ( - is => 'ro', - isa => 'Str', - default => 'Default', - ); - -# Source. Can be XML obj, JSON data struct, or string. -# Not used if the witness is created by parsing a collation. has 'file' => ( is => 'ro', isa => 'Str', @@ -258,13 +270,6 @@ has 'use_text' => ( isa => 'Str', ); -has 'msdesc' => ( # if we started with a TEI doc - is => 'ro', - isa => 'XML::LibXML::Element', - predicate => 'has_msdesc', - writer => '_save_msdesc', - ); - # Text. This is an array of strings (i.e. word tokens). # TODO Think about how to handle this for the case of pre-prepared # collations, where the tokens are in the graph already. @@ -289,6 +294,7 @@ has 'path' => ( clearer => 'clear_path', ); +## TODO change the name of this has 'uncorrected_path' => ( is => 'rw', isa => 'ArrayRef[Text::Tradition::Collation::Reading]', @@ -365,7 +371,6 @@ sub _init_from_xmldesc { # Get the identifier if( my $desc = $xpc->find( $tags{msDesc} ) ) { my $descnode = $desc->get_node(1); - $self->_save_msdesc( $descnode ); # First try to use settlement/repository/idno. my( $setNode, $reposNode, $idNode ) = ( $xpc->find( $tags{settlement}, $descnode )->get_node(1), @@ -422,13 +427,14 @@ sub _init_from_xmldesc { message => "No text element in document '" . $self->{'identifier'} . "!" ); } + my @text = map { $_->text } @words; + my @layertext = map { $_->text } @layerwords; $self->path( \@words ); - my $a = join( ' ', map { $_->text } @words ); - my $b = join( ' ', map { $_->text } @layerwords ); - if( $a ne $b ) { + $self->text( \@text ); + if( join( ' ', @text ) ne join( ' ', @layertext ) ) { $self->uncorrected_path( \@layerwords ); + $self->layertext( \@layertext ); } - # TODO set self->text } sub _tokenize_text { @@ -471,7 +477,7 @@ sub _objectify_words { # Hunt down each wrapped word/seg, and make an object (or two objects) # of it, if necessary. foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) { - my( $text, $uncorr ) = _get_word_object( $c ); + my( $text, $uncorr ) = _get_word_strings( $c ); # try { # ( $text, $uncorr ) = _get_word_object( $c ); # } catch( Text::Tradition::Error $e @@ -533,8 +539,8 @@ sub _get_word_strings { my $word_excluded = 0; my $xpc = _xpc_for_el( $node ); # TODO This does not cope with nested add/dels. - my @addition = $xpc->findnodes( 'ancestor::' . $tags{add} ); - my @deletion = $xpc->findnodes( 'ancestor::' . $tags{del} ); + my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) ); + my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) ); foreach my $c ($node->childNodes() ) { if( $c->nodeName eq 'num' && defined $c->getAttribute( 'value' ) ) { @@ -555,26 +561,28 @@ sub _get_word_strings { $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/; next; } elsif( $c->nodeName eq 'add' ) { - my( $use, $discard ) = _get_text_from_node( $c ); + my( $use, $discard ) = _get_word_strings( $c ); $text .= $use; } elsif( $c->nodeName eq 'del' ) { - my( $discard, $use ) = _get_text_from_node( $c ); + my( $discard, $use ) = _get_word_strings( $c ); $uncorrtext .= $use; } else { - my $tagtxt; + my ( $tagtxt, $taguncorr ); if( ref( $c ) eq 'XML::LibXML::Text' ) { # A text node. $tagtxt = $c->textContent; + $taguncorr = $c->textContent; } else { - $tagtxt = _get_text_from_node( $c ); + ( $tagtxt, $taguncorr ) = _get_word_strings( $c ); } if( $strip_leading_space ) { $tagtxt =~ s/^[\s\n]+//s; + $taguncorr =~ s/^[\s\n]+//s; # Unset the flag as soon as we see non-whitespace. $strip_leading_space = 0 if $tagtxt; } $text .= $tagtxt; - $uncorrtext .= $tagtxt; + $uncorrtext .= $taguncorr; } } throw( ident => "text not found", diff --git a/t/data/witnesses/group.xml b/t/data/witnesses/group.xml new file mode 100644 index 0000000..feb971c --- /dev/null +++ b/t/data/witnesses/group.xml @@ -0,0 +1,67 @@ + + + + + + Lorem ipsum + Cero + + Transcription by + Tara L Andrews + + + +

Unpublished use case

+
+ + + + Lorem Ipsum Test + + + +
+
+ + + A selection of pseudo-Latin texts + + + + +

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec nec mi et felis gravida + hendrerit ornare eget lorem. Proin nisi sem, aliquam eget aliquet ut, lacinia sed erat. + Morbi posuere euismod turpis et volutpat. Mauris suscipit, nisi eget dignissim con +sectetur, tortor felis tristique lorem, lacinia feugiat leo ante vel diam. Duis ac + mauris libero, at suscipit risus. Curabitur orci nunc, commodo sed ornare sed, dictum et + nunc. Nulla facilisi. Suspendisse vestibulum dignissim turpis, ut pellentesque orci + convallis aliquet. Mauris metus purus, ullamcorper ut imperdiet et, tristique fermentum + arcu. Donec congue blandit aliquet. Nunc semper mollis mollis. Nulla tempus, augue vitae + iaculis vulputate, neque diam placerat risus, lacinia luctus purus mauris in ligula. + Fusce vehicula eleifend pharetra. Cras nec libero diam, at semper lacus. Nulla + tristique, ligula id lobortis volutpat, eros metus condimentum orci, in interdum lorem + nisi ut justo.

+ +
+ + +

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec nec mi et felis gravida + hendrerit ornare eget lorem. Proin nisi sem, aliquam eget aliquet ut, lacinia sed erat. + Morbi posuere euismod turpis et volutpat. Mauris suscipit, nisi eget dignissim + consectetur, tortor felis tristique lorem, lacinia feugiat leo ante vel diam. Duis ac + mauris libero, at suscipit risus. Curabitur orci nunc, commodo sed ornare sed, dictum et + nunc. Nulla facilisi. Suspendisse vestibulum dignissim turpis, ut pellentesque orci + convallis aliquet. Mauris metus purus, ullamcorper ut imperdiet et, tristique fermentum + arcu. Donec congue blandit aliquet. Nunc semper mollis mollis. Nulla tempus, augue vitae + iaculis vulputate, neque diam placerat risus, lacinia luctus purus mauris in ligula. + Fusce vehicula eleifend pharetra. Cras nec libero diam, at semper lacus. Nulla + tristique, ligula id lobortis volutpat, eros metus condimentum orci, in interdum lorem + nisi ut justo. Fusce felis ante, vestibulum condimentum aliquet eget, lobortis quis + nibh. Quisque eget malesuada sem. Integer congue luctus rhoncus.

+ +
+
+
+
+ diff --git a/t/data/witnesses/teiwit.xml b/t/data/witnesses/teiwit.xml index cc30d0c..3a0caa0 100644 --- a/t/data/witnesses/teiwit.xml +++ b/t/data/witnesses/teiwit.xml @@ -1,6 +1,5 @@ - - + @@ -35,7 +34,7 @@ ացն որք ականատեսք էին լեա՛լք ամենայն եղելոցս, և նեղութեանցս այսոցիկ զոր վասն մեղաց կրեաց տունն հայո՛ց և ահաւոր զայս բազում անգամ զմտաւ ածեալ եմ վասն այս յետին ժամանակին գրել զդառնաշունչ կտրծսն։

-

Իսկ ի լինել թվականութեանս հայոց ի յամս ԵՃԲ եղև ահ +

Իսկ ի լինել թվականութեանս հայոց ի յամս ԵՃԲ եղև ահ աւոր նշան և սոսկալի, և կտրծ մեծ ի բարկութենէ, երևեալ ի մեծ քաղաքն անտիոք, զոր ի ներքոյ արեգականն գործեցաւ՛ այս սքանչելիս որ է՛ր տես ողացն ահաւոր և հրաշալի. և եղև այս իշխնշանս ա՛հ և դողումն ամենայն հաւատաց ելոց քրիստոսի, զորս յայնժամ մեծասաստ սպառնալեօք յայտնեաց աստուած զդատաստան diff --git a/t/text_tradition_witness.t b/t/text_tradition_witness.t index 20e8f22..2457b2c 100644 --- a/t/text_tradition_witness.t +++ b/t/text_tradition_witness.t @@ -36,18 +36,29 @@ is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness', is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness', "Found second JSON witness" ); -# # Test an XML witness via file -# my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc', -# 'file' => 't/data/witnesses/teiwit.xml' ); -# is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" ); -# if( $xmlwit ) { -# is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" ); -# ok( $xmlwit->is_layered, "Picked up correction layer" ); -# is( @{$xmlwit->path}, 185, "Got correct text length" ); -# is( @{$xmlwit->uncorrected_path}, 185, "Got correct a.c. text length" ); -# } +# Test an XML witness via file +my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc', + 'file' => 't/data/witnesses/teiwit.xml' ); +is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" ); +if( $xmlwit ) { + is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" ); + ok( $xmlwit->is_layered, "Picked up correction layer" ); + is( @{$xmlwit->text}, 182, "Got correct text length" ); + is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" ); +} +my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings; +is( @allwitwords, 184, "Reused appropriate readings" ); ## Test use_text +my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc', + 'file' => 't/data/witnesses/group.xml', + 'use_text' => '//tei:group/tei:text[2]' ); +is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" ); +if( $xpwit ) { + is( $xpwit->sigil, 'G', "XML part witness has correct sigil" ); + ok( !$xpwit->is_layered, "Picked up no correction layer" ); + is( @{$xpwit->text}, 157, "Got correct text length" ); +} }