X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FCTE.pm;h=be6adfcdc9c6aa360a0154487831af0f71231b48;hb=222d58f10f00b766849be205554662b2bbcd29f9;hp=faabf7b778d1d021a614d1e297f0d307b558c68d;hpb=4d85a60eb069976c0cb2a049bb23d626649064fd;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Parser/CTE.pm b/lib/Text/Tradition/Parser/CTE.pm index faabf7b..be6adfc 100644 --- a/lib/Text/Tradition/Parser/CTE.pm +++ b/lib/Text/Tradition/Parser/CTE.pm @@ -2,6 +2,7 @@ package Text::Tradition::Parser::CTE; use strict; use warnings; +use Encode qw/ decode /; use Text::Tradition::Parser::Util qw/ collate_variants /; use XML::LibXML; use XML::LibXML::XPathContext; @@ -31,30 +32,29 @@ initializes the Tradition from the file. my %sigil_for; # Save the XML IDs for witnesses. my %apps; # Save the apparatus XML for a given ID. +my %has_ac; # Keep track of witnesses that have corrections. sub parse { - my( $tradition, $xml_str ) = @_; + my( $tradition, $opts ) = @_; my $c = $tradition->collation; # Some shorthand # First, parse the XML. - my $parser = XML::LibXML->new(); - my $doc = $parser->parse_string( $xml_str ); - my $tei = $doc->documentElement(); - my $xpc = XML::LibXML::XPathContext->new( $tei ); + my( $tei, $xpc ) = _remove_formatting( $opts ); + return unless $tei; # we have already warned. # CTE uses a DTD rather than any xmlns-based parsing. Thus we # need no namespace handling. - # Get the witnesses and create the witness objects. foreach my $wit_el ( $xpc->findnodes( '//sourceDesc/listWit/witness' ) ) { # The witness xml:id is used internally, and is *not* the sigil name. my $id= $wit_el->getAttribute( 'xml:id' ); - my @sig_parts = $xpc->findnodes( './abbr/descendant::text()', $wit_el ); + my @sig_parts = $xpc->findnodes( 'descendant::text()', $wit_el ); my $sig = _stringify_sigil( @sig_parts ); - $tradition->add_witness( sigil => $sig, source => $wit_el->toString() ); + print STDERR "Adding witness $sig\n"; + $tradition->add_witness( sigil => $sig, sourcetype => 'collation' ); $sigil_for{'#'.$id} = $sig; # Make life easy by keying on the ID ref syntax } - + # Now go through the text and find the base tokens, apparatus tags, and # anchors. Make a giant array of all of these things in sequence. # TODO consider combining this with creation of graph below @@ -64,29 +64,31 @@ sub parse { push( @base_text, _get_base( $xn ) ); } } - # We now have to work through this array applying the alternate # apparatus readings to the base text. Essentially we will put # everything on the graph, from which we will delete the apps and # anchors when we are done. + + # First, put the base tokens, apps, and anchors in the graph. my $counter = 0; my $last = $c->start; foreach my $item ( @base_text ) { my $r; if( $item->{'type'} eq 'token' ) { - $r = $c->add_reading( 'n'.$counter++ ); - $r->text( $item->{'content'} ); + $r = $c->add_reading( { id => 'n'.$counter++, + text => $item->{'content'} } ); } elsif ( $item->{'type'} eq 'anchor' ) { - $r = $c->add_reading( '#ANCHOR_' . $item->{'content'} . '#' ); + $r = $c->add_reading( { id => '__ANCHOR_' . $item->{'content'} . '__', + is_ph => 1 } ); } elsif ( $item->{'type'} eq 'app' ) { - my $tag = '#APP_' . $counter++ . '#'; - $r = $c->add_reading( $tag ); + my $tag = '__APP_' . $counter++ . '__'; + $r = $c->add_reading( { id => $tag, is_ph => 1 } ); $apps{$tag} = $item->{'content'}; } - $c->add_path( $last, $r, 'BASE' ); + $c->add_path( $last, $r, $c->baselabel ); $last = $r; } - $c->add_path( $last, $c->end, 'BASE' ); + $c->add_path( $last, $c->end, $c->baselabel ); # Now we can parse the apparatus entries, and add the variant readings # to the graph. @@ -95,63 +97,64 @@ sub parse { _add_readings( $c, $app_id ); } - # With the variant readings added, we now have to walk the graph for - # each witness and add an explicit path wherever there is not a divergence - # from BASE. Thus we will also construct $wit->path. - $DB::single = 1; - foreach my $wit ( $tradition->witnesses ) { - my $sig = $wit->sigil; - my @wit_path = $c->reading_sequence( $c->start, $c->end, $sig, 'BASE' ); - my $cur = $c->start; - foreach my $n ( @wit_path ) { - next if $cur eq $c->start; - my @paths = $cur->edges_to( $n ); - unless( grep { $_->name eq $sig } @paths ) { - $c->add_path( $cur, $n, $sig ); - } - } - $wit->path( \@wit_path ); - } - - # Collated readings are now on the graph, so now we get to remove - # all BASE edges and all app/anchor nodes. - foreach my $p ( $c->paths ) { - $c->del_path( $p ) if $p->name eq 'BASE'; - } - foreach my $n ( $c->readings ) { - if( $n->name =~ /^\#A(PP|NCHOR)/ ) { - # Pair up incoming / outgoing edges with the same label - my( %incoming, %outgoing ); - foreach my $e ( $n->incoming ) { - $incoming{$e->name} = $e->from; - $c->del_path( $e ); - } - foreach my $e ( $n->outgoing ) { - $outgoing{$e->name} = $e->to; - $c->del_path( $e ); - } - foreach my $w ( keys %incoming ) { - my $from = $incoming{$w}; - my $to = delete $outgoing{$w}; - warn "No outgoing edge on ".$n->name." for wit $w" unless $to; - $c->add_path( $from, $to, $w ); - } - foreach my $w ( keys %outgoing ) { - warn "Found no incoming edge on ".$n->name." for wit $w"; - } - $c->del_reading( $n ); - } - } + # Finally, add explicit witness paths, remove the base paths, and remove + # the app/anchor tags. + expand_all_paths( $c ); + + # Save the text for each witness so that we can ensure consistency + # later on + $tradition->collation->text_from_paths(); + $tradition->collation->calculate_ranks(); + $tradition->collation->flatten_ranks(); } sub _stringify_sigil { my( @nodes ) = @_; my @parts = grep { /\w/ } map { $_->data } @nodes; - return join( '', @parts ); + my $whole = join( '', @parts ); + $whole =~ s/\W//g; + return $whole; } -## Recursive little helper function to help us navigate through nested -## XML, picking out the words, the apparatus, and the anchors. +# Get rid of all the formatting elements that get in the way of tokenization. +sub _remove_formatting { + my( $opts ) = @_; + + # First, parse the original XML + my $parser = XML::LibXML->new(); + my $doc; + if( exists $opts->{'string'} ) { + $doc = $parser->parse_string( $opts->{'string'} ); + } elsif ( exists $opts->{'file'} ) { + $doc = $parser->parse_file( $opts->{'file'} ); + } else { + warn "Could not find string or file option to parse"; + return; + } + + # Second, remove the formatting + my $xpc = XML::LibXML::XPathContext->new( $doc->documentElement ); + my @useless = $xpc->findnodes( '//hi' ); + foreach my $n ( @useless ) { + my $parent = $n->parentNode(); + my @children = $n->childNodes(); + my $first = shift @children; + $parent->replaceChild( $first, $n ); + foreach my $c ( @children ) { + $parent->insertAfter( $c, $first ); + $first = $c; + } + } + + # Third, write out and reparse to merge the text nodes. + my $result = decode( $doc->encoding, $doc->toString() ); + my $tei = $parser->parse_string( $result )->documentElement; + $xpc = XML::LibXML::XPathContext->new( $tei ); + return( $tei, $xpc ); +} + +## Helper function to help us navigate through nested XML, picking out +## the words, the apparatus, and the anchors. sub _get_base { my( $xn ) = @_; @@ -161,14 +164,8 @@ sub _get_base { # to our sequence. my $str = $xn->data; $str =~ s/^\s+//; - foreach my $w ( split( /\s+/, $str ) ) { - push( @readings, { 'type' => 'token', 'content' => $w } ); - } - } elsif( $xn->nodeName eq 'hi' ) { - # Recurse as if the hi weren't there. - foreach( $xn->childNodes ) { - push( @readings, _get_base( $_ ) ); - } + my @tokens = split( /\s+/, $str ); + push( @readings, map { { 'type' => 'token', 'content' => $_ } } @tokens ); } elsif( $xn->nodeName eq 'app' ) { # Apparatus, just save the entire XML node. push( @readings, { 'type' => 'app', 'content' => $xn } ); @@ -182,6 +179,29 @@ sub _get_base { return @readings; } +sub _append_tokens { + my( $list, @tokens ) = @_; + if( @$list && $list->[-1]->{'content'} =~ /\#JOIN\#$/ ) { + # The list evidently ended mid-word; join the next token onto it. + my $t = shift @tokens; + if( ref $t && $t->{'type'} eq 'token' ) { + # Join the word + $t = $t->{'content'}; + } elsif( ref $t ) { + # An app or anchor intervened; end the word. + unshift( @tokens, $t ); + $t = ''; + } + $list->[-1]->{'content'} =~ s/\#JOIN\#$/$t/; + } + foreach my $t ( @tokens ) { + unless( ref( $t ) ) { + $t = { 'type' => 'token', 'content' => $t }; + } + push( @$list, $t ); + } +} + sub _add_readings { my( $c, $app_id ) = @_; my $xn = $apps{$app_id}; @@ -189,55 +209,79 @@ sub _add_readings { # Get the lemma, which is all the readings between app and anchor, # excluding other apps or anchors. my @lemma = _return_lemma( $c, $app_id, $anchor ); - my $lemma_str = join( ' ', grep { $_ !~ /^\#/ } map { $_->text } @lemma ); + my $lemma_str = join( ' ', grep { $_ !~ /^__/ } map { $_->text } @lemma ); # For each reading, send its text to 'interpret' along with the lemma, # and then save the list of witnesses that these tokens belong to. - my %wit_rdgs; + my %wit_rdgs; # Maps from witnesses to the variant text my $ctr = 0; my $tag = $app_id; - $tag =~ s/^\#APP_(.*)\#$/$1/; + $tag =~ s/^\__APP_(.*)\__$/$1/; + foreach my $rdg ( $xn->getChildrenByTagName( 'rdg' ) ) { my @text; - my $wits = $rdg->getAttribute( 'wit' ); foreach ( $rdg->childNodes ) { push( @text, _get_base( $_ ) ); } - my $interpreted = @text - ? interpret( join( ' ', map { $_->{'content'} } @text ), $lemma_str ) - : ''; + my( $interpreted, $flag ) = ( '', undef ); + if( @text ) { + ( $interpreted, $flag ) = interpret( + join( ' ', map { $_->{'content'} } @text ), $lemma_str ); + } + next if( $interpreted eq $lemma_str ) && !$flag; # Reading is lemma. + my @rdg_nodes; - foreach my $w ( split( /\s+/, $interpreted ) ) { - my $r = $c->add_reading( $tag . "/" . $ctr++ ); - $r->text( $w ); - push( @rdg_nodes, $r ); + if( $interpreted eq '#LACUNA#' ) { + push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++, + is_lacuna => 1 } ) ); + } else { + foreach my $w ( split( /\s+/, $interpreted ) ) { + my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++, + text => $w } ); + push( @rdg_nodes, $r ); + } } - $wit_rdgs{$wits} = \@rdg_nodes; - } - + # For each listed wit, save the reading. + foreach my $wit ( split( /\s+/, $rdg->getAttribute( 'wit' ) ) ) { + $wit .= $flag if $flag; + $wit_rdgs{$wit} = \@rdg_nodes; + } + + # Does the reading have an ID? If so it probably has a witDetail + # attached, and we need to read it. + if( $rdg->hasAttribute( 'xml:id' ) ) { + warn "Witdetail on meta reading" if $flag; # this could get complicated. + my $rid = $rdg->getAttribute( 'xml:id' ); + my $xpc = XML::LibXML::XPathContext->new( $xn ); + my @details = $xpc->findnodes( './witDetail[@target="'.$rid.'"]' ); + foreach my $d ( @details ) { + _parse_wit_detail( $d, \%wit_rdgs, \@lemma ); + } + } + } + # Now collate the variant readings, since it is not done for us. collate_variants( $c, \@lemma, values %wit_rdgs ); - + # Now add the witness paths for each reading. - foreach my $wit_str ( keys %wit_rdgs ) { - my @wits = get_sigla( $wit_str ); - my $rdg_list = $wit_rdgs{$wit_str}; - _add_wit_path( $c, $rdg_list, $app_id, $anchor, @wits ); + foreach my $wit_id ( keys %wit_rdgs ) { + my $witstr = get_sigil( $wit_id, $c ); + my $rdg_list = $wit_rdgs{$wit_id}; + _add_wit_path( $c, $rdg_list, $app_id, $anchor, $witstr ); } } sub _anchor_name { my $xmlid = shift; $xmlid =~ s/^\#//; - return sprintf( "#ANCHOR_%s#", $xmlid ); + return sprintf( "__ANCHOR_%s__", $xmlid ); } sub _return_lemma { my( $c, $app, $anchor ) = @_; - my $app_node = $c->graph->node( $app ); - my $anchor_node = $c->graph->node( $anchor ); - my @nodes = grep { $_->name !~ /^\#A(PP|NCHOR)$/ } - $c->reading_sequence( $app_node, $anchor_node, 'BASE' ); + my @nodes = grep { $_->id !~ /^__A(PP|NCHOR)/ } + $c->reading_sequence( $c->reading( $app ), $c->reading( $anchor ), + $c->baselabel ); return @nodes; } @@ -247,23 +291,43 @@ sub interpret { return $reading if $reading eq $lemma; my $oldreading = $reading; # $lemma =~ s/\s+[[:punct:]]+$//; - # $reading =~ s/\s*\(?sic([\s\w.]+)?\)?$//; + my $flag; # In case of p.c. indications my @words = split( /\s+/, $lemma ); if( $reading =~ /^(.*) praem.$/ ) { $reading = "$1 $lemma"; } elsif( $reading =~ /^(.*) add.$/ ) { $reading = "$lemma $1"; + } elsif( $reading =~ /add. alia manu/ + || $reading =~ /inscriptionem compegi e/ # TODO huh? + || $reading eq 'inc.' # TODO huh? + ) { + # Ignore it. + $reading = $lemma; + } elsif( $reading =~ /locus [uv]acuus/ + || $reading eq 'def.' + || $reading eq 'illeg.' + || $reading eq 'onleesbar' + ) { + $reading = '#LACUNA#'; } elsif( $reading eq 'om.' ) { $reading = ''; - } elsif( $reading eq 'inv.' ) { + } elsif( $reading =~ /^in[uv]\.$/ + || $reading eq 'transp.' ) { # Hope it is two words. print STDERR "WARNING: want to invert a lemma that is not two words\n" unless scalar( @words ) == 2; $reading = join( ' ', reverse( @words ) ); - } elsif( $reading eq 'iter.' ) { + } elsif( $reading =~ /^iter(\.|at)$/ ) { # Repeat the lemma $reading = "$lemma $lemma"; - } elsif( $reading =~ /^(.*) \.\.\. (.*)$/ ) { + } elsif( $reading eq 'in marg.' ) { + # There was nothing before a correction. + $reading = ''; + $flag = '_ac'; + } elsif( $reading =~ /^(.*?)\s*\(?sic([\s\w.]+)?\)?$/ ) { + # Discard any 'sic' notation; indeed, indeed. + $reading = $1; + } elsif( $reading =~ /^(.*) \.\.\. (.*)$/ ) { # The first and last N words captured should replace the first and # last N words of the lemma. my @begin = split( /\s+/, $1 ); @@ -277,27 +341,77 @@ sub interpret { $reading = join( ' ', @words ); } } - print STDERR "Interpreted $oldreading as $reading given $lemma\n"; - return $reading; + if( $oldreading ne $reading || $flag || $oldreading =~ /\./ ) { + my $int = $reading; + $int .= " ($flag)" if $flag; + print STDERR "Interpreted $oldreading as $int given $lemma\n"; + } + return( $reading, $flag ); +} + +sub _parse_wit_detail { + my( $detail, $readings, $lemma ) = @_; + my $wit = $detail->getAttribute( 'wit' ); + my $content = $detail->textContent; + if( $content =~ /a\.\s*c\./ ) { + # Replace the key in the $readings hash + my $rdg = delete $readings->{$wit}; + $readings->{$wit.'_ac'} = $rdg; + $has_ac{$sigil_for{$wit}} = 1; + } elsif( $content =~ /p\.\s*c\./ ) { + # If no key for the wit a.c. exists, add one pointing to the lemma + unless( exists $readings->{$wit.'_ac'} ) { + $readings->{$wit.'_ac'} = $lemma; + } + $has_ac{$sigil_for{$wit}} = 1; + } # else don't bother just yet +} + +sub get_sigil { + my( $xml_id, $c ) = @_; + if( $xml_id =~ /^(.*)_ac$/ ) { + my $real_id = $1; + return $sigil_for{$real_id} . $c->ac_label; + } else { + return $sigil_for{$xml_id}; + } } -sub get_sigla { - my $witstr = shift; - my @xml_ids = split( /\s+/, $witstr ); - my @sigs = map { $sigil_for{$_} } @xml_ids; - return @sigs; +sub expand_all_paths { + my( $c ) = @_; + + # Walk the collation and fish out the paths for each witness + foreach my $wit ( $c->tradition->witnesses ) { + my $sig = $wit->sigil; + my @path = grep { !$_->is_ph } + $c->reading_sequence( $c->start, $c->end, $sig ); + $wit->path( \@path ); + if( $has_ac{$sig} ) { + my @ac_path = grep { !$_->is_ph } + $c->reading_sequence( $c->start, $c->end, $sig.$c->ac_label ); + $wit->uncorrected_path( \@ac_path ); + } + } + + # Delete the anchors + foreach my $anchor ( grep { $_->is_ph } $c->readings ) { + $c->del_reading( $anchor ); + } + # Delete the base edges + map { $c->del_path( $_, $c->baselabel ) } $c->paths; + + # Make the path edges + $c->make_witness_paths(); } sub _add_wit_path { - my( $c, $rdg, $app, $anchor, @wits ) = @_; + my( $c, $rdg, $app, $anchor, $wit ) = @_; my @nodes = @$rdg; - push( @nodes, $c->graph->node( $anchor ) ); + push( @nodes, $c->reading( $anchor ) ); - my $cur = $c->graph->node( $app ); + my $cur = $c->reading( $app ); foreach my $n ( @nodes ) { - foreach my $w ( @wits ) { - $c->add_path( $cur, $n, $w ); - } + $c->add_path( $cur, $n, $wit ); $cur = $n; } }