X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FCTE.pm;h=be6adfcdc9c6aa360a0154487831af0f71231b48;hb=222d58f10f00b766849be205554662b2bbcd29f9;hp=af989c7fadedb50d5973f72aa102ccba1042414b;hpb=c9158e60d47ce702857a268db2e337257adf619d;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Parser/CTE.pm b/lib/Text/Tradition/Parser/CTE.pm index af989c7..be6adfc 100644 --- a/lib/Text/Tradition/Parser/CTE.pm +++ b/lib/Text/Tradition/Parser/CTE.pm @@ -51,7 +51,7 @@ sub parse { my @sig_parts = $xpc->findnodes( 'descendant::text()', $wit_el ); my $sig = _stringify_sigil( @sig_parts ); print STDERR "Adding witness $sig\n"; - $tradition->add_witness( sigil => $sig, source => $wit_el->toString() ); + $tradition->add_witness( sigil => $sig, sourcetype => 'collation' ); $sigil_for{'#'.$id} = $sig; # Make life easy by keying on the ID ref syntax } @@ -78,10 +78,10 @@ sub parse { $r = $c->add_reading( { id => 'n'.$counter++, text => $item->{'content'} } ); } elsif ( $item->{'type'} eq 'anchor' ) { - $r = $c->add_reading( { id => '#ANCHOR_' . $item->{'content'} . '#', + $r = $c->add_reading( { id => '__ANCHOR_' . $item->{'content'} . '__', is_ph => 1 } ); } elsif ( $item->{'type'} eq 'app' ) { - my $tag = '#APP_' . $counter++ . '#'; + my $tag = '__APP_' . $counter++ . '__'; $r = $c->add_reading( { id => $tag, is_ph => 1 } ); $apps{$tag} = $item->{'content'}; } @@ -111,7 +111,9 @@ sub parse { sub _stringify_sigil { my( @nodes ) = @_; my @parts = grep { /\w/ } map { $_->data } @nodes; - return join( '', @parts ); + my $whole = join( '', @parts ); + $whole =~ s/\W//g; + return $whole; } # Get rid of all the formatting elements that get in the way of tokenization. @@ -207,14 +209,14 @@ sub _add_readings { # Get the lemma, which is all the readings between app and anchor, # excluding other apps or anchors. my @lemma = _return_lemma( $c, $app_id, $anchor ); - my $lemma_str = join( ' ', grep { $_ !~ /^\#/ } map { $_->text } @lemma ); + my $lemma_str = join( ' ', grep { $_ !~ /^__/ } map { $_->text } @lemma ); # For each reading, send its text to 'interpret' along with the lemma, # and then save the list of witnesses that these tokens belong to. my %wit_rdgs; # Maps from witnesses to the variant text my $ctr = 0; my $tag = $app_id; - $tag =~ s/^\#APP_(.*)\#$/$1/; + $tag =~ s/^\__APP_(.*)\__$/$1/; foreach my $rdg ( $xn->getChildrenByTagName( 'rdg' ) ) { my @text; @@ -230,11 +232,11 @@ sub _add_readings { my @rdg_nodes; if( $interpreted eq '#LACUNA#' ) { - push( @rdg_nodes, $c->add_reading( { id => $tag . "/" . $ctr++, + push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++, is_lacuna => 1 } ) ); } else { foreach my $w ( split( /\s+/, $interpreted ) ) { - my $r = $c->add_reading( { id => $tag . "/" . $ctr++, + my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++, text => $w } ); push( @rdg_nodes, $r ); } @@ -259,7 +261,6 @@ sub _add_readings { } # Now collate the variant readings, since it is not done for us. - $DB::single = 1 if @lemma > 10; collate_variants( $c, \@lemma, values %wit_rdgs ); # Now add the witness paths for each reading. @@ -273,12 +274,12 @@ sub _add_readings { sub _anchor_name { my $xmlid = shift; $xmlid =~ s/^\#//; - return sprintf( "#ANCHOR_%s#", $xmlid ); + return sprintf( "__ANCHOR_%s__", $xmlid ); } sub _return_lemma { my( $c, $app, $anchor ) = @_; - my @nodes = grep { $_->id !~ /^\#A(PP|NCHOR)/ } + my @nodes = grep { $_->id !~ /^__A(PP|NCHOR)/ } $c->reading_sequence( $c->reading( $app ), $c->reading( $anchor ), $c->baselabel ); return @nodes;