X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FTEI.pm;h=3d5fd03543647eeb64adcd5ed19fedb6da07e81a;hb=15db7774a381c3ffff41a26bcb9f9e7bc9e65515;hp=7b68ba2d4103d297d4ac9103aed1ecc959ad36ea;hpb=dfc37e3886452920c1332a045f9102ce29457af1;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm index 7b68ba2..3d5fd03 100644 --- a/lib/Text/Tradition/Parser/TEI.pm +++ b/lib/Text/Tradition/Parser/TEI.pm @@ -10,22 +10,63 @@ use XML::LibXML::XPathContext; Text::Tradition::Parser::TEI +=head1 SYNOPSIS + + use Text::Tradition; + + my $t_from_file = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'TEI', + 'file' => '/path/to/parallel_seg_file.xml' + ); + + my $t_from_string = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'TEI', + 'string' => $parallel_seg_xml, + ); + + =head1 DESCRIPTION -Parser module for Text::Tradition, given a TEI parallel-segmentation -file that describes a text and its variants. +Parser module for Text::Tradition, given a TEI parallel-segmentation file +that describes a text and its variants. Normally called upon +initialization of Text::Tradition. + +The witnesses for the tradition are taken from the element +within the TEI header; the readings are taken from any

element that +appears in the text body (including elements therein.) =head1 METHODS -=over +=head2 B( $tradition, $option_hash ) -=item B +Takes an initialized tradition and a set of options; creates the +appropriate nodes and edges on the graph, as well as the appropriate +witness objects. The $option_hash must contain either a 'file' or a +'string' argument with the XML to be parsed. -parse( $tei_string ); +=begin testing -Takes an initialized tradition and a string containing the TEI; -creates the appropriate nodes and edges on the graph, as well as -the appropriate witness objects. +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +my $par_seg = 't/data/florilegium_tei_ps.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'TEI', + 'file' => $par_seg, + ); + +is( ref( $t ), 'Text::Tradition', "Parsed parallel-segmentation TEI" ); +if( $t ) { + is( scalar $t->collation->readings, 311, "Collation has all readings" ); + is( scalar $t->collation->paths, 361, "Collation has all paths" ); +} + +=end testing =cut @@ -41,7 +82,7 @@ my $app_count; # Keep track of how many apps we have # is considered a bad idea. The long way round then. my( $LISTWIT, $WITNESS, $TEXT, $W, $SEG, $APP, $RDG, $LEM ) = ( 'listWit', 'witness', 'text', 'w', 'seg', 'app', 'rdg', 'lem' ); -sub make_tagnames { +sub _make_tagnames { my( $ns ) = @_; if( $ns ) { $LISTWIT = "$ns:$LISTWIT"; @@ -77,7 +118,7 @@ sub parse { $ns = 'tei'; $xpc->registerNs( $ns, $tei->namespaceURI ); } - make_tagnames( $ns ); + _make_tagnames( $ns ); # Then get the witnesses and create the witness objects. foreach my $wit_el ( $xpc->findnodes( "//$LISTWIT/$WITNESS" ) ) { @@ -88,8 +129,8 @@ sub parse { map { $text->{$_->sigil} = [] } $tradition->witnesses; # Look for all word/seg node IDs and note their pre-existence. - my @attrs = $xpc->findnodes( "//$W|$SEG/attribute::xml:id" ); - save_preexisting_nodeids( @attrs ); + my @attrs = $xpc->findnodes( "//$W/attribute::xml:id" ); + _save_preexisting_nodeids( @attrs ); # Count up how many apps we have. my @apps = $xpc->findnodes( "//$APP" ); @@ -106,7 +147,6 @@ sub parse { # Join them up. my $c = $tradition->collation; foreach my $sig ( keys %$text ) { - next if $sig eq 'base'; # Skip base text readings with no witnesses. # Determine the list of readings for my $sequence = $text->{$sig}; my @real_sequence = ( $c->start ); @@ -118,7 +158,6 @@ sub parse { $c->add_path( $source, $rdg, $sig ); $source = $rdg; } - $tradition->witness( $sig )->path( \@real_sequence ); # See if we need to make an a.c. version of the witness. if( exists $app_ac->{$sig} ) { my @uncorrected; @@ -129,30 +168,33 @@ sub parse { my @new = map { _return_rdg( $_ ) } @{$app_ac->{$sig}->{$app}}; _replace_sequence( \@uncorrected, $start, $end, @new ); } - my $source = $c->start; + my $source = shift @uncorrected; # the start node + warn "Something weird!" unless $source eq $c->start; foreach my $rdg ( @uncorrected ) { - my $has_base = grep { $_->label eq $sig } $source->edges_to( $rdg ); - if( $rdg ne $c->start && !$has_base ) { - print STDERR sprintf( "Adding path %s from %s -> %s\n", - $sig.$c->ac_label, $source->name, $rdg->name ); - $c->add_path( $source, $rdg, $sig.$c->ac_label ); - } - $source = $rdg; + unless( $c->has_path( $source, $rdg, $sig ) ) { + $c->add_path( $source, $rdg, $sig.$c->ac_label ); + } + $source = $rdg; } - print STDERR "Adding a.c. version for witness $sig\n"; - $tradition->witness( $sig )->uncorrected_path( \@uncorrected ); + warn "Something else weird!" unless $source eq $c->end; + # print STDERR "Adding a.c. version for witness $sig\n"; + $tradition->witness( $sig )->is_layered( 1 ); } } - # Delete readings that are no longer part of the graph. - # TODO think this is useless actually - foreach ( keys %$substitutions ) { - $tradition->collation->del_reading( $tradition->collation->reading( $_ ) ); - } - $tradition->collation->calculate_ranks(); + + # Calculate the ranks for the nodes. + $tradition->collation->calculate_ranks(); # Now that we have ranks, see if we have distinct nodes with identical # text and identical rank that can be merged. $tradition->collation->flatten_ranks(); + + # And now that we've done that, calculate the common nodes. + $tradition->collation->calculate_common_readings(); + + # Save the text for each witness so that we can ensure consistency + # later on + $tradition->collation->text_from_paths(); } sub _clean_sequence { @@ -164,7 +206,7 @@ sub _clean_sequence { my $app_id = $1; if( exists $app_ac->{$wit} && exists $app_ac->{$wit}->{$app_id} ) { - print STDERR "Retaining empty placeholder for $app_id\n"; + # print STDERR "Retaining empty placeholder for $app_id\n"; push( @clean_sequence, $rdg ); } } else { @@ -178,8 +220,8 @@ sub _replace_sequence { my( $arr, $start, $end, @new ) = @_; my( $start_idx, $end_idx ); foreach my $i ( 0 .. $#{$arr} ) { - $start_idx = $i if( $arr->[$i]->name eq $start ); - if( $arr->[$i]->name eq $end ) { + $start_idx = $i if( $arr->[$i]->id eq $start ); + if( $arr->[$i]->id eq $end ) { $end_idx = $i; last; } @@ -198,13 +240,15 @@ sub _return_rdg { # passed a reading object, return the object. my $wantobj = ref( $rdg ) eq 'Text::Tradition::Collation::Reading'; my $real = $rdg; - if( exists $substitutions->{ $wantobj ? $rdg->name : $rdg } ) { - $real = $substitutions->{ $wantobj ? $rdg->name : $rdg }; - $real = $real->name unless $wantobj; + if( exists $substitutions->{ $wantobj ? $rdg->id : $rdg } ) { + $real = $substitutions->{ $wantobj ? $rdg->id : $rdg }; + $real = $real->id unless $wantobj; } return $real; } +## TODO test specific sorts of nodes of the parallel-seg XML. + ## Recursive helper function to help us navigate through nested XML, ## picking out the text. $tradition is the tradition, needed for ## making readings; $xn is the XML node currently being looked at, @@ -237,11 +281,8 @@ sub _return_rdg { foreach my $w ( split( /\s+/, $str ) ) { # For now, skip punctuation. next if $w !~ /[[:alnum:]]/; - my $rdg = make_reading( $tradition->collation, $w ); + my $rdg = _make_reading( $tradition->collation, $w ); push( @new_readings, $rdg ); - unless( $in_var ) { - $rdg->make_common; - } foreach ( @cur_wits ) { warn "Empty wit!" unless $_; warn "Empty reading!" unless $rdg; @@ -256,11 +297,8 @@ sub _return_rdg { warn "$c is not among active wits" unless $active_wits{$c}; } my $xml_id = $xn->getAttribute( 'xml:id' ); - my $rdg = make_reading( $tradition->collation, $xn->textContent, $xml_id ); + my $rdg = _make_reading( $tradition->collation, $xn->textContent, $xml_id ); push( @new_readings, $rdg ); - unless( $in_var ) { - $rdg->make_common; - } foreach( @cur_wits ) { warn "Empty wit!" unless $_; warn "Empty reading!" unless $rdg; @@ -284,7 +322,7 @@ sub _return_rdg { # Return the entire set of unique readings. my %unique; foreach my $s ( @sets ) { - map { $unique{$_->name} = $_ } @$s; + map { $unique{$_->id} = $_ } @$s; } push( @new_readings, values( %unique ) ); # Exit the current app. @@ -294,8 +332,8 @@ sub _return_rdg { #print STDERR "Handling reading for " . $xn->getAttribute( 'wit' ) . "\n"; # TODO handle p.c. and s.l. designations too $ac = $xn->getAttribute( 'type' ) && $xn->getAttribute( 'type' ) eq 'a.c.'; - my @rdg_wits = get_sigla( $xn ); - @rdg_wits = ( 'base' ) unless @rdg_wits; # Allow for editorially-supplied readings + my @rdg_wits = _get_sigla( $xn ); + return unless @rdg_wits; # Skip readings that appear in no witnesses my @words; foreach ( $xn->childNodes ) { my @rdg_set = _get_readings( $tradition, $_, 1, $ac, @rdg_wits ); @@ -313,8 +351,8 @@ sub _return_rdg { # Add the reading set to the app anchors for each witness # or put in placeholders for empty p.c. readings foreach ( @rdg_wits ) { - my $start = @words ? $words[0]->name : "PH-$current_app"; - my $end = @words ? $words[-1]->name : "PH-$current_app"; + my $start = @words ? $words[0]->id : "PH-$current_app"; + my $end = @words ? $words[-1]->id : "PH-$current_app"; $app_anchors->{$current_app}->{$_}->{'start'} = $start; $app_anchors->{$current_app}->{$_}->{'end'} = $end; push( @{$text->{$_}}, $start ) unless @words; @@ -331,7 +369,9 @@ sub _return_rdg { my $i = 0; foreach my $sig ( keys %$text ) { next if $active_wits{$sig}; - my $l = $tradition->collation->add_lacuna( $current_app . "_$i" ); + my $l = $tradition->collation->add_reading( { + 'id' => $current_app . "_$i", + 'is_lacuna' => 1 } ); $i++; push( @{$text->{$sig}}, $l ); } @@ -344,11 +384,14 @@ sub _return_rdg { unless( $seen_apps == $app_count ) { foreach my $i ( 0 .. $#cur_wits ) { my $w = $cur_wits[$i]; - my $l = $tradition->collation->add_lacuna( $current_app . "_$i" ); + my $l = $tradition->collation->add_reading( { + 'id' => $current_app . "_$i", + 'is_lacuna' => 1 } ); push( @{$text->{$w}}, $l ); } } - } elsif( $xn->nodeName eq 'witDetail' ) { + } elsif( $xn->nodeName eq 'witDetail' + || $xn->nodeName eq 'note' ) { # Ignore these for now. return; } else { @@ -363,8 +406,26 @@ sub _return_rdg { } +=begin testing + +use XML::LibXML; +use XML::LibXML::XPathContext; +use Text::Tradition::Parser::TEI; + +my $xml_str = 'some text'; +my $el = XML::LibXML->new()->parse_string( $xml_str )->documentElement; +my $xpc = XML::LibXML::XPathContext->new( $el ); +my $obj = $xpc->find( '//rdg' ); + +my @wits = Text::Tradition::Parser::TEI::_get_sigla( $obj ); +is( join( ' ', @wits) , "A B C D", "correctly parsed reading wit string" ); + +=end testing + +=cut + # Helper to extract a list of witness sigla from a reading element. -sub get_sigla { +sub _get_sigla { my( $rdg ) = @_; # Cope if we have been handed a NodeList. There is only # one reading here. @@ -375,6 +436,7 @@ sub get_sigla { my @wits; if( ref( $rdg ) eq 'XML::LibXML::Element' ) { my $witstr = $rdg->getAttribute( 'wit' ); + return () unless $witstr; $witstr =~ s/^\s+//; $witstr =~ s/\s+$//; @wits = split( /\s+/, $witstr ); @@ -388,13 +450,13 @@ sub get_sigla { my $word_ctr = 0; my %used_nodeids; - sub save_preexisting_nodeids { + sub _save_preexisting_nodeids { foreach( @_ ) { $used_nodeids{$_->getValue()} = 1; } } - sub make_reading { + sub _make_reading { my( $graph, $word, $xml_id ) = @_; if( $xml_id ) { if( exists $used_nodeids{$xml_id} ) { @@ -413,11 +475,35 @@ sub get_sigla { $xml_id = $try_id; } } - my $rdg = $graph->add_reading( $xml_id ); - $rdg->text( $word ); + my $rdg = $graph->add_reading( + { 'id' => $xml_id, + 'text' => $word } + ); $used_nodeids{$xml_id} = $rdg; return $rdg; } } 1; + +=head1 BUGS / TODO + +=over + +=item * More unit testing + +=item * Handle special designations apart from a.c. + +=item * Mark common nodes within collated variants + +=back + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews Eaurum@cpan.orgE