X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FTEI.pm;h=78ae54215ad3791d4b3f0d7f0d03e8d73d99dc7d;hb=62a39b8f5d0ae86b26350664828069a2a44f5645;hp=03b76cf14aba5d54dbe7b83f8f4f16a2798a6934;hpb=910a0a6d9f858731358772a45e52817b039cf019;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm index 03b76cf..78ae542 100644 --- a/lib/Text/Tradition/Parser/TEI.pm +++ b/lib/Text/Tradition/Parser/TEI.pm @@ -10,30 +10,71 @@ use XML::LibXML::XPathContext; Text::Tradition::Parser::TEI +=head1 SYNOPSIS + + use Text::Tradition; + + my $t_from_file = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'TEI', + 'file' => '/path/to/parallel_seg_file.xml' + ); + + my $t_from_string = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'TEI', + 'string' => $parallel_seg_xml, + ); + + =head1 DESCRIPTION -Parser module for Text::Tradition, given a TEI parallel-segmentation -file that describes a text and its variants. +Parser module for Text::Tradition, given a TEI parallel-segmentation file +that describes a text and its variants. Normally called upon +initialization of Text::Tradition. + +The witnesses for the tradition are taken from the element +within the TEI header; the readings are taken from any

element that +appears in the text body (including elements therein.) =head1 METHODS -=over +=head2 B( $tradition, $option_hash ) + +Takes an initialized tradition and a set of options; creates the +appropriate nodes and edges on the graph, as well as the appropriate +witness objects. The $option_hash must contain either a 'file' or a +'string' argument with the XML to be parsed. -=item B +=begin testing -parse( $tei_string ); +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; -Takes an initialized tradition and a string containing the TEI; -creates the appropriate nodes and edges on the graph, as well as -the appropriate witness objects. +my $par_seg = 't/data/florilegium_tei_ps.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'TEI', + 'file' => $par_seg, + ); + +is( ref( $t ), 'Text::Tradition', "Parsed parallel-segmentation TEI" ); +if( $t ) { + is( scalar $t->collation->readings, 311, "Collation has all readings" ); + is( scalar $t->collation->paths, 361, "Collation has all paths" ); +} + +=end testing =cut my $text = {}; # Hash of arrays, one per eventual witness we find. -my @common_readings; my $substitutions = {}; # Keep track of merged readings my $app_anchors = {}; # Track apparatus references my $app_ac = {}; # Save a.c. readings +my $app_count; # Keep track of how many apps we have # Create the package variables for tag names. @@ -41,7 +82,7 @@ my $app_ac = {}; # Save a.c. readings # is considered a bad idea. The long way round then. my( $LISTWIT, $WITNESS, $TEXT, $W, $SEG, $APP, $RDG, $LEM ) = ( 'listWit', 'witness', 'text', 'w', 'seg', 'app', 'rdg', 'lem' ); -sub make_tagnames { +sub _make_tagnames { my( $ns ) = @_; if( $ns ) { $LISTWIT = "$ns:$LISTWIT"; @@ -57,11 +98,19 @@ sub make_tagnames { # Parse the TEI file. sub parse { - my( $tradition, $xml_str ) = @_; + my( $tradition, $opts ) = @_; # First, parse the XML. my $parser = XML::LibXML->new(); - my $doc = $parser->parse_string( $xml_str ); + my $doc; + if( exists $opts->{'string'} ) { + $doc = $parser->parse_string( $opts->{'string'} ); + } elsif ( exists $opts->{'file'} ) { + $doc = $parser->parse_file( $opts->{'file'} ); + } else { + warn "Could not find string or file option to parse"; + return; + } my $tei = $doc->documentElement(); my $xpc = XML::LibXML::XPathContext->new( $tei ); my $ns; @@ -69,19 +118,23 @@ sub parse { $ns = 'tei'; $xpc->registerNs( $ns, $tei->namespaceURI ); } - make_tagnames( $ns ); + _make_tagnames( $ns ); # Then get the witnesses and create the witness objects. foreach my $wit_el ( $xpc->findnodes( "//$LISTWIT/$WITNESS" ) ) { my $sig = $wit_el->getAttribute( 'xml:id' ); my $source = $wit_el->toString(); - $tradition->add_witness( sigil => $sig, source => $source ); + $tradition->add_witness( sigil => $sig, sourcetype => 'collation' ); } - map { $text->{$_->sigil} = [] } $tradition->witnesses; + # Look for all word/seg node IDs and note their pre-existence. - my @attrs = $xpc->findnodes( "//$W|$SEG/attribute::xml:id" ); - save_preexisting_nodeids( @attrs ); + my @attrs = $xpc->findnodes( "//$W/attribute::xml:id" ); + _save_preexisting_nodeids( @attrs ); + + # Count up how many apps we have. + my @apps = $xpc->findnodes( "//$APP" ); + $app_count = scalar( @apps ); # Now go through the children of the text element and pull out the # actual text. @@ -94,60 +147,62 @@ sub parse { # Join them up. my $c = $tradition->collation; foreach my $sig ( keys %$text ) { - next if $sig eq 'base'; # Skip base text readings with no witnesses. # Determine the list of readings for my $sequence = $text->{$sig}; my @real_sequence = ( $c->start ); push( @$sequence, $c->end ); - my $source = $c->start; - foreach( _clean_sequence( $sig, $sequence ) ) { - my $rdg = _return_rdg( $_ ); - push( @real_sequence, $rdg ); - $c->add_path( $source, $rdg, $sig ); - $source = $rdg; + foreach( _clean_sequence( $sig, $sequence, 1 ) ) { + push( @real_sequence, _return_rdg( $_ ) ); } - $tradition->witness( $sig )->path( \@real_sequence ); # See if we need to make an a.c. version of the witness. if( exists $app_ac->{$sig} ) { my @uncorrected; push( @uncorrected, @real_sequence ); + # Get rid of any remaining placeholders. + @real_sequence = _clean_sequence( $sig, \@uncorrected ); + # Do the uncorrections foreach my $app ( keys %{$app_ac->{$sig}} ) { my $start = _return_rdg( $app_anchors->{$app}->{$sig}->{'start'} ); my $end = _return_rdg( $app_anchors->{$app}->{$sig}->{'end'} ); my @new = map { _return_rdg( $_ ) } @{$app_ac->{$sig}->{$app}}; _replace_sequence( \@uncorrected, $start, $end, @new ); } - my $source = $c->start; - foreach my $rdg ( @uncorrected ) { - my $has_base = grep { $_->label eq $sig } $source->edges_to( $rdg ); - if( $rdg ne $c->start && !$has_base ) { - print STDERR sprintf( "Adding path %s from %s -> %s\n", - $sig.$c->ac_label, $source->name, $rdg->name ); - $c->add_path( $source, $rdg, $sig.$c->ac_label ); - } - $source = $rdg; - } + # and record the results. $tradition->witness( $sig )->uncorrected_path( \@uncorrected ); + $tradition->witness( $sig )->is_layered( 1 ); } + $tradition->witness( $sig )->path( \@real_sequence ); } - # Delete readings that are no longer part of the graph. - # TODO think this is useless actually - foreach ( keys %$substitutions ) { - $tradition->collation->del_reading( $tradition->collation->reading( $_ ) ); - } - $tradition->collation->calculate_positions( @common_readings ); + # Now make our witness paths. + $tradition->collation->make_witness_paths(); + + # Calculate the ranks for the nodes. + $tradition->collation->calculate_ranks(); + + # Now that we have ranks, see if we have distinct nodes with identical + # text and identical rank that can be merged. + $tradition->collation->flatten_ranks(); + + # And now that we've done that, calculate the common nodes. + $tradition->collation->calculate_common_readings(); + + # Save the text for each witness so that we can ensure consistency + # later on + $tradition->collation->text_from_paths(); } sub _clean_sequence { - my( $wit, $sequence ) = @_; + my( $wit, $sequence, $keep_ac ) = @_; my @clean_sequence; foreach my $rdg ( @$sequence ) { if( $rdg =~ /^PH-(.*)$/ ) { - # It is a placeholder. Keep it only if we need it. + # It is a placeholder. Keep it only if we need it for a later + # a.c. run. my $app_id = $1; - if( exists $app_ac->{$wit}->{$app_id} ) { - print STDERR "Retaining empty placeholder for $app_id\n"; - push( @clean_sequence, $rdg ); + if( $keep_ac && exists $app_ac->{$wit} && + exists $app_ac->{$wit}->{$app_id} ) { + # print STDERR "Retaining empty placeholder for $app_id\n"; + push( @clean_sequence, $rdg ); } } else { push( @clean_sequence, $rdg ); @@ -160,8 +215,10 @@ sub _replace_sequence { my( $arr, $start, $end, @new ) = @_; my( $start_idx, $end_idx ); foreach my $i ( 0 .. $#{$arr} ) { - $start_idx = $i if( $arr->[$i]->name eq $start ); - if( $arr->[$i]->name eq $end ) { + # If $arr->[$i] is a placeholder, cope. + my $iid = ref( $arr->[$i] ) ? $arr->[$i]->id : $arr->[$i]; + $start_idx = $i if( $iid eq $start ); + if( $iid eq $end ) { $end_idx = $i; last; } @@ -180,13 +237,15 @@ sub _return_rdg { # passed a reading object, return the object. my $wantobj = ref( $rdg ) eq 'Text::Tradition::Collation::Reading'; my $real = $rdg; - if( exists $substitutions->{ $wantobj ? $rdg->name : $rdg } ) { - $real = $substitutions->{ $wantobj ? $rdg->name : $rdg }; - $real = $real->name unless $wantobj; + if( exists $substitutions->{ $wantobj ? $rdg->id : $rdg } ) { + $real = $substitutions->{ $wantobj ? $rdg->id : $rdg }; + $real = $real->id unless $wantobj; } return $real; } +## TODO test specific sorts of nodes of the parallel-seg XML. + ## Recursive helper function to help us navigate through nested XML, ## picking out the text. $tradition is the tradition, needed for ## making readings; $xn is the XML node currently being looked at, @@ -196,12 +255,13 @@ sub _return_rdg { ## Returns the list of readings, if any, created on the run. { - my @active_wits; + my %active_wits; my $current_app; + my $seen_apps; sub _get_readings { my( $tradition, $xn, $in_var, $ac, @cur_wits ) = @_; - @cur_wits = @active_wits unless $in_var; + @cur_wits = grep { $active_wits{$_} } keys %active_wits unless $in_var; my @new_readings; if( $xn->nodeType == XML_TEXT_NODE ) { @@ -211,20 +271,15 @@ sub _return_rdg { #print STDERR "Handling text node " . $str . "\n"; # Check that all the witnesses we have are active. foreach my $c ( @cur_wits ) { - warn "Could not find $c in active wits" - unless grep { $c eq $_ } @active_wits; + warn "$c is not among active wits" unless $active_wits{$c}; } $str =~ s/^\s+//; my $final = $str =~ s/\s+$//; foreach my $w ( split( /\s+/, $str ) ) { # For now, skip punctuation. next if $w !~ /[[:alnum:]]/; - my $rdg = make_reading( $tradition->collation, $w ); + my $rdg = _make_reading( $tradition->collation, $w ); push( @new_readings, $rdg ); - unless( $in_var ) { - push( @common_readings, $rdg ); - $rdg->make_common; - } foreach ( @cur_wits ) { warn "Empty wit!" unless $_; warn "Empty reading!" unless $rdg; @@ -236,22 +291,18 @@ sub _return_rdg { #print STDERR "Handling word " . $xn->toString . "\n"; # Check that all the witnesses we have are active. foreach my $c ( @cur_wits ) { - warn "Could not find $c in active wits" - unless grep { $c eq $_ } @active_wits; + warn "$c is not among active wits" unless $active_wits{$c}; } my $xml_id = $xn->getAttribute( 'xml:id' ); - my $rdg = make_reading( $tradition->collation, $xn->textContent, $xml_id ); + my $rdg = _make_reading( $tradition->collation, $xn->textContent, $xml_id ); push( @new_readings, $rdg ); - unless( $in_var ) { - push( @common_readings, $rdg ); - $rdg->make_common; - } foreach( @cur_wits ) { warn "Empty wit!" unless $_; warn "Empty reading!" unless $rdg; push( @{$text->{$_}}, $rdg ) unless $ac; } } elsif ( $xn->nodeName eq 'app' ) { + $seen_apps++; $current_app = $xn->getAttribute( 'xml:id' ); # print STDERR "Handling app $current_app\n"; # Keep the reading sets in this app. @@ -264,11 +315,10 @@ sub _return_rdg { # Now collate these sets if we have more than one. my $subs = collate_variants( $tradition->collation, @sets ) if @sets > 1; map { $substitutions->{$_} = $subs->{$_} } keys %$subs; - # TODO Look through substitutions to see if we can make anything common now. # Return the entire set of unique readings. my %unique; foreach my $s ( @sets ) { - map { $unique{$_->name} = $_ } @$s; + map { $unique{$_->id} = $_ } @$s; } push( @new_readings, values( %unique ) ); # Exit the current app. @@ -276,9 +326,10 @@ sub _return_rdg { } elsif ( $xn->nodeName eq 'lem' || $xn->nodeName eq 'rdg' ) { # Alter the current witnesses and recurse. #print STDERR "Handling reading for " . $xn->getAttribute( 'wit' ) . "\n"; + # TODO handle p.c. and s.l. designations too $ac = $xn->getAttribute( 'type' ) && $xn->getAttribute( 'type' ) eq 'a.c.'; - my @rdg_wits = get_sigla( $xn ); - @rdg_wits = ( 'base' ) unless @rdg_wits; # Allow for editorially-supplied readings + my @rdg_wits = _get_sigla( $xn ); + return unless @rdg_wits; # Skip readings that appear in no witnesses my @words; foreach ( $xn->childNodes ) { my @rdg_set = _get_readings( $tradition, $_, 1, $ac, @rdg_wits ); @@ -296,8 +347,8 @@ sub _return_rdg { # Add the reading set to the app anchors for each witness # or put in placeholders for empty p.c. readings foreach ( @rdg_wits ) { - my $start = @words ? $words[0]->name : "PH-$current_app"; - my $end = @words ? $words[-1]->name : "PH-$current_app"; + my $start = @words ? $words[0]->id : "PH-$current_app"; + my $end = @words ? $words[-1]->id : "PH-$current_app"; $app_anchors->{$current_app}->{$_}->{'start'} = $start; $app_anchors->{$current_app}->{$_}->{'end'} = $end; push( @{$text->{$_}}, $start ) unless @words; @@ -307,13 +358,36 @@ sub _return_rdg { } elsif( $xn->nodeName eq 'witStart' ) { # Add the relevant wit(s) to the active list. #print STDERR "Handling witStart\n"; - push( @active_wits, @cur_wits ); + map { $active_wits{$_} = 1 } @cur_wits; + # Record a lacuna in all non-active witnesses if this is + # the first app. Get the full list from $text. + if( $seen_apps == 1 ) { + my $i = 0; + foreach my $sig ( keys %$text ) { + next if $active_wits{$sig}; + my $l = $tradition->collation->add_reading( { + 'id' => $current_app . "_$i", + 'is_lacuna' => 1 } ); + $i++; + push( @{$text->{$sig}}, $l ); + } + } } elsif( $xn->nodeName eq 'witEnd' ) { # Take the relevant wit(s) out of the list. #print STDERR "Handling witEnd\n"; - my $regexp = '^(' . join( '|', @cur_wits ) . ')$'; - @active_wits = grep { $_ !~ /$regexp/ } @active_wits; - } elsif( $xn->nodeName eq 'witDetail' ) { + map { $active_wits{$_} = undef } @cur_wits; + # Record a lacuna, unless this is the last app. + unless( $seen_apps == $app_count ) { + foreach my $i ( 0 .. $#cur_wits ) { + my $w = $cur_wits[$i]; + my $l = $tradition->collation->add_reading( { + 'id' => $current_app . "_$i", + 'is_lacuna' => 1 } ); + push( @{$text->{$w}}, $l ); + } + } + } elsif( $xn->nodeName eq 'witDetail' + || $xn->nodeName eq 'note' ) { # Ignore these for now. return; } else { @@ -328,8 +402,26 @@ sub _return_rdg { } +=begin testing + +use XML::LibXML; +use XML::LibXML::XPathContext; +use Text::Tradition::Parser::TEI; + +my $xml_str = 'some text'; +my $el = XML::LibXML->new()->parse_string( $xml_str )->documentElement; +my $xpc = XML::LibXML::XPathContext->new( $el ); +my $obj = $xpc->find( '//rdg' ); + +my @wits = Text::Tradition::Parser::TEI::_get_sigla( $obj ); +is( join( ' ', @wits) , "A B C D", "correctly parsed reading wit string" ); + +=end testing + +=cut + # Helper to extract a list of witness sigla from a reading element. -sub get_sigla { +sub _get_sigla { my( $rdg ) = @_; # Cope if we have been handed a NodeList. There is only # one reading here. @@ -340,6 +432,7 @@ sub get_sigla { my @wits; if( ref( $rdg ) eq 'XML::LibXML::Element' ) { my $witstr = $rdg->getAttribute( 'wit' ); + return () unless $witstr; $witstr =~ s/^\s+//; $witstr =~ s/\s+$//; @wits = split( /\s+/, $witstr ); @@ -353,13 +446,13 @@ sub get_sigla { my $word_ctr = 0; my %used_nodeids; - sub save_preexisting_nodeids { + sub _save_preexisting_nodeids { foreach( @_ ) { $used_nodeids{$_->getValue()} = 1; } } - sub make_reading { + sub _make_reading { my( $graph, $word, $xml_id ) = @_; if( $xml_id ) { if( exists $used_nodeids{$xml_id} ) { @@ -378,11 +471,35 @@ sub get_sigla { $xml_id = $try_id; } } - my $rdg = $graph->add_reading( $xml_id ); - $rdg->text( $word ); + my $rdg = $graph->add_reading( + { 'id' => $xml_id, + 'text' => $word } + ); $used_nodeids{$xml_id} = $rdg; return $rdg; } } 1; + +=head1 BUGS / TODO + +=over + +=item * More unit testing + +=item * Handle special designations apart from a.c. + +=item * Mark common nodes within collated variants + +=back + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews Eaurum@cpan.orgE