X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FCTE.pm;h=be6adfcdc9c6aa360a0154487831af0f71231b48;hb=222d58f10f00b766849be205554662b2bbcd29f9;hp=e706906b6a076c248007a0b6a08e15488344edb6;hpb=861c3e272c65c7553ad7c03cca51cbdd561f126c;p=scpubgit%2Fstemmatology.git

diff --git a/lib/Text/Tradition/Parser/CTE.pm b/lib/Text/Tradition/Parser/CTE.pm
index e706906..be6adfc 100644
--- a/lib/Text/Tradition/Parser/CTE.pm
+++ b/lib/Text/Tradition/Parser/CTE.pm
@@ -2,6 +2,7 @@ package Text::Tradition::Parser::CTE;
 
 use strict;
 use warnings;
+use Encode qw/ decode /;
 use Text::Tradition::Parser::Util qw/ collate_variants /;
 use XML::LibXML;
 use XML::LibXML::XPathContext;
@@ -38,32 +39,22 @@ sub parse {
     my $c = $tradition->collation;  # Some shorthand
 
     # First, parse the XML.
-    my $parser = XML::LibXML->new();
-    my $doc;
-    if( exists $opts->{'string'} ) {
-        $doc = $parser->parse_string( $opts->{'string'} );
-    } elsif ( exists $opts->{'file'} ) {
-        $doc = $parser->parse_file( $opts->{'file'} );
-    } else {
-        warn "Could not find string or file option to parse";
-        return;
-    }
-    my $tei = $doc->documentElement();
-    my $xpc = XML::LibXML::XPathContext->new( $tei );
+    my( $tei, $xpc ) = _remove_formatting( $opts );
+    return unless $tei;  # we have already warned.
 
     # CTE uses a DTD rather than any xmlns-based parsing.  Thus we
     # need no namespace handling.
-
     # Get the witnesses and create the witness objects.
     foreach my $wit_el ( $xpc->findnodes( '//sourceDesc/listWit/witness' ) ) {
         # The witness xml:id is used internally, and is *not* the sigil name.
         my $id= $wit_el->getAttribute( 'xml:id' );
-        my @sig_parts = $xpc->findnodes( './abbr/descendant::text()', $wit_el );
+        my @sig_parts = $xpc->findnodes( 'descendant::text()', $wit_el );
         my $sig = _stringify_sigil( @sig_parts );
-        $tradition->add_witness( sigil => $sig, source => $wit_el->toString() );
+        print STDERR "Adding witness $sig\n";
+        $tradition->add_witness( sigil => $sig, sourcetype => 'collation' );
         $sigil_for{'#'.$id} = $sig;  # Make life easy by keying on the ID ref syntax
     }
-
+
     # Now go through the text and find the base tokens, apparatus tags, and
     # anchors.  Make a giant array of all of these things in sequence.
     # TODO consider combining this with creation of graph below
@@ -73,7 +64,6 @@ sub parse {
             push( @base_text, _get_base( $xn ) );
         }
     }
-
     # We now have to work through this array applying the alternate
     # apparatus readings to the base text.  Essentially we will put
     # everything on the graph, from which we will delete the apps and
@@ -88,10 +78,10 @@ sub parse {
             $r = $c->add_reading( { id => 'n'.$counter++,
                                     text => $item->{'content'} } );
         } elsif ( $item->{'type'} eq 'anchor' ) {
-            $r = $c->add_reading( { id => '#ANCHOR_' . $item->{'content'} . '#',
+            $r = $c->add_reading( { id => '__ANCHOR_' . $item->{'content'} . '__',
                                     is_ph => 1 } );
         } elsif ( $item->{'type'} eq 'app' ) {
-            my $tag = '#APP_' . $counter++ . '#';
+            my $tag = '__APP_' . $counter++ . '__';
             $r = $c->add_reading( { id => $tag, is_ph => 1 } );
             $apps{$tag} = $item->{'content'};
         }
@@ -114,16 +104,57 @@ sub parse {
 
     # Save the text for each witness so that we can ensure consistency
     # later on
     $tradition->collation->text_from_paths();
+    $tradition->collation->calculate_ranks();
+    $tradition->collation->flatten_ranks();
 }
 
 sub _stringify_sigil {
     my( @nodes ) = @_;
     my @parts = grep { /\w/ } map { $_->data } @nodes;
-    return join( '', @parts );
+    my $whole = join( '', @parts );
+    $whole =~ s/\W//g;
+    return $whole;
+}
+
+# Get rid of all the formatting elements that get in the way of tokenization.
+sub _remove_formatting {
+    my( $opts ) = @_;
+
+    # First, parse the original XML
+    my $parser = XML::LibXML->new();
+    my $doc;
+    if( exists $opts->{'string'} ) {
+        $doc = $parser->parse_string( $opts->{'string'} );
+    } elsif ( exists $opts->{'file'} ) {
+        $doc = $parser->parse_file( $opts->{'file'} );
+    } else {
+        warn "Could not find string or file option to parse";
+        return;
+    }
+
+    # Second, remove the formatting
+    my $xpc = XML::LibXML::XPathContext->new( $doc->documentElement );
+    my @useless = $xpc->findnodes( '//hi' );
+    foreach my $n ( @useless ) {
+        my $parent = $n->parentNode();
+        my @children = $n->childNodes();
+        my $first = shift @children;
+        $parent->replaceChild( $first, $n );
+        foreach my $c ( @children ) {
+            $parent->insertAfter( $c, $first );
+            $first = $c;
+        }
+    }
+
+    # Third, write out and reparse to merge the text nodes.
+    my $result = decode( $doc->encoding, $doc->toString() );
+    my $tei = $parser->parse_string( $result )->documentElement;
+    $xpc = XML::LibXML::XPathContext->new( $tei );
+    return( $tei, $xpc );
 }
 
-## Recursive little helper function to help us navigate through nested
-## XML, picking out the words, the apparatus, and the anchors.
+## Helper function to help us navigate through nested XML, picking out
+## the words, the apparatus, and the anchors.
 sub _get_base {
     my( $xn ) = @_;
@@ -131,17 +162,10 @@ sub _get_base {
     if( $xn->nodeType == XML_TEXT_NODE ) {
         # Base text, just split the words on whitespace and add them
         # to our sequence.
-        # TODO consider that XML markup might appear mid-token.
         my $str = $xn->data;
         $str =~ s/^\s+//;
-        foreach my $w ( split( /\s+/, $str ) ) {
-            push( @readings, { 'type' => 'token', 'content' => $w } );
-        }
-    } elsif( $xn->nodeName eq 'hi' ) {
-        # Recurse as if the hi weren't there.
-        foreach( $xn->childNodes ) {
-            push( @readings, _get_base( $_ ) );
-        }
+        my @tokens = split( /\s+/, $str );
+        push( @readings, map { { 'type' => 'token', 'content' => $_ } } @tokens );
     } elsif( $xn->nodeName eq 'app' ) {
         # Apparatus, just save the entire XML node.
         push( @readings, { 'type' => 'app', 'content' => $xn } );
@@ -155,6 +179,29 @@ sub _get_base {
     return @readings;
 }
 
+sub _append_tokens {
+    my( $list, @tokens ) = @_;
+    if( @$list && $list->[-1]->{'content'} =~ /\#JOIN\#$/ ) {
+        # The list evidently ended mid-word; join the next token onto it.
+        my $t = shift @tokens;
+        if( ref $t && $t->{'type'} eq 'token' ) {
+            # Join the word
+            $t = $t->{'content'};
+        } elsif( ref $t ) {
+            # An app or anchor intervened; end the word.
+            unshift( @tokens, $t );
+            $t = '';
+        }
+        $list->[-1]->{'content'} =~ s/\#JOIN\#$/$t/;
+    }
+    foreach my $t ( @tokens ) {
+        unless( ref( $t ) ) {
+            $t = { 'type' => 'token', 'content' => $t };
+        }
+        push( @$list, $t );
+    }
+}
+
 sub _add_readings {
     my( $c, $app_id ) = @_;
     my $xn = $apps{$app_id};
@@ -162,14 +209,15 @@ sub _add_readings {
     # Get the lemma, which is all the readings between app and anchor,
     # excluding other apps or anchors.
     my @lemma = _return_lemma( $c, $app_id, $anchor );
-    my $lemma_str = join( ' ', grep { $_ !~ /^\#/ } map { $_->text } @lemma );
+    my $lemma_str = join( ' ', grep { $_ !~ /^__/ } map { $_->text } @lemma );
 
     # For each reading, send its text to 'interpret' along with the lemma,
     # and then save the list of witnesses that these tokens belong to.
     my %wit_rdgs;  # Maps from witnesses to the variant text
     my $ctr = 0;
     my $tag = $app_id;
-    $tag =~ s/^\#APP_(.*)\#$/$1/;
+    $tag =~ s/^\__APP_(.*)\__$/$1/;
+
     foreach my $rdg ( $xn->getChildrenByTagName( 'rdg' ) ) {
         my @text;
         foreach ( $rdg->childNodes ) {
@@ -184,11 +232,11 @@ sub _add_readings {
 
         my @rdg_nodes;
         if( $interpreted eq '#LACUNA#' ) {
-            push( @rdg_nodes, $c->add_reading( { id => $tag . "/" . $ctr++,
+            push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++,
                                                  is_lacuna => 1 } ) );
         } else {
             foreach my $w ( split( /\s+/, $interpreted ) ) {
-                my $r = $c->add_reading( { id => $tag . "/" . $ctr++,
+                my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++,
                                            text => $w } );
                 push( @rdg_nodes, $r );
             }
@@ -226,12 +274,12 @@ sub _add_readings {
 sub _anchor_name {
     my $xmlid = shift;
     $xmlid =~ s/^\#//;
-    return sprintf( "#ANCHOR_%s#", $xmlid );
+    return sprintf( "__ANCHOR_%s__", $xmlid );
 }
 
 sub _return_lemma {
     my( $c, $app, $anchor ) = @_;
-    my @nodes = grep { $_->id !~ /^\#A(PP|NCHOR)/ }
+    my @nodes = grep { $_->id !~ /^__A(PP|NCHOR)/ }
         $c->reading_sequence( $c->reading( $app ), $c->reading( $anchor ),
                               $c->baselabel );
     return @nodes;
@@ -243,7 +291,6 @@ sub interpret {
     return $reading if $reading eq $lemma;
     my $oldreading = $reading;
     # $lemma =~ s/\s+[[:punct:]]+$//;
-    # $reading =~ s/\s*\(?sic([\s\w.]+)?\)?$//;
     my $flag;  # In case of p.c. indications
     my @words = split( /\s+/, $lemma );
     if( $reading =~ /^(.*) praem.$/ ) {
@@ -258,11 +305,14 @@ sub interpret {
         $reading = $lemma;
     } elsif( $reading =~ /locus [uv]acuus/
         || $reading eq 'def.'
+        || $reading eq 'illeg.'
+        || $reading eq 'onleesbar'
         ) {
         $reading = '#LACUNA#';
     } elsif( $reading eq 'om.' ) {
         $reading = '';
-    } elsif( $reading =~ /^in[uv]\.$/ ) {
+    } elsif( $reading =~ /^in[uv]\.$/
+             || $reading eq 'transp.' ) {
         # Hope it is two words.
         print STDERR "WARNING: want to invert a lemma that is not two words\n"
             unless scalar( @words ) == 2;
@@ -274,6 +324,9 @@ sub interpret {
         # There was nothing before a correction.
         $reading = '';
         $flag = '_ac';
+    } elsif( $reading =~ /^(.*?)\s*\(?sic([\s\w.]+)?\)?$/ ) {
+        # Discard any 'sic' notation; indeed, indeed.
+        $reading = $1;
     } elsif( $reading =~ /^(.*) \.\.\. (.*)$/ ) {
         # The first and last N words captured should replace the first and
         # last N words of the lemma.
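
For reference, a minimal sketch of how a CTE apparatus export might be fed to this parser through the Text::Tradition front end. The 'input' => 'CTE' dispatch and the 'file' option mirror the $opts->{'file'} handling checked in parse() / _remove_formatting() above; the file path and tradition name below are placeholders, and the as_graphml() call is just one possible way to inspect the resulting collation.

    use strict;
    use warnings;
    use Text::Tradition;

    # Hypothetical invocation; a 'string' option with raw XML would also work,
    # per the option check in _remove_formatting().
    my $tradition = Text::Tradition->new(
        name  => 'Example CTE tradition',        # placeholder name
        input => 'CTE',
        file  => 'path/to/cte-apparatus.xml',    # placeholder path
    );

    # The parser populates the witnesses and readings on the collation object.
    print $tradition->collation->as_graphml();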