X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FTEI.pm;h=23876ab4262a0bd6646919506c2880cded1c0a1e;hb=3bc0cd189b8f6d8182fe009614993805c56deaf6;hp=03b76cf14aba5d54dbe7b83f8f4f16a2798a6934;hpb=910a0a6d9f858731358772a45e52817b039cf019;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm index 03b76cf..23876ab 100644 --- a/lib/Text/Tradition/Parser/TEI.pm +++ b/lib/Text/Tradition/Parser/TEI.pm @@ -30,10 +30,10 @@ the appropriate witness objects. =cut my $text = {}; # Hash of arrays, one per eventual witness we find. -my @common_readings; my $substitutions = {}; # Keep track of merged readings my $app_anchors = {}; # Track apparatus references my $app_ac = {}; # Save a.c. readings +my $app_count; # Keep track of how many apps we have # Create the package variables for tag names. @@ -77,12 +77,16 @@ sub parse { my $source = $wit_el->toString(); $tradition->add_witness( sigil => $sig, source => $source ); } - map { $text->{$_->sigil} = [] } $tradition->witnesses; + # Look for all word/seg node IDs and note their pre-existence. my @attrs = $xpc->findnodes( "//$W|$SEG/attribute::xml:id" ); save_preexisting_nodeids( @attrs ); + # Count up how many apps we have. + my @apps = $xpc->findnodes( "//$APP" ); + $app_count = scalar( @apps ); + # Now go through the children of the text element and pull out the # actual text. foreach my $xml_el ( $xpc->findnodes( "//$TEXT" ) ) { @@ -127,6 +131,7 @@ sub parse { } $source = $rdg; } + print STDERR "Adding a.c. version for witness $sig\n"; $tradition->witness( $sig )->uncorrected_path( \@uncorrected ); } } @@ -135,7 +140,11 @@ sub parse { foreach ( keys %$substitutions ) { $tradition->collation->del_reading( $tradition->collation->reading( $_ ) ); } - $tradition->collation->calculate_positions( @common_readings ); + $tradition->collation->calculate_ranks(); + + # Now that we have ranks, see if we have distinct nodes with identical + # text and identical rank that can be merged. + $tradition->collation->flatten_ranks(); } sub _clean_sequence { @@ -145,7 +154,8 @@ sub _clean_sequence { if( $rdg =~ /^PH-(.*)$/ ) { # It is a placeholder. Keep it only if we need it. my $app_id = $1; - if( exists $app_ac->{$wit}->{$app_id} ) { + if( exists $app_ac->{$wit} && + exists $app_ac->{$wit}->{$app_id} ) { print STDERR "Retaining empty placeholder for $app_id\n"; push( @clean_sequence, $rdg ); } @@ -196,12 +206,13 @@ sub _return_rdg { ## Returns the list of readings, if any, created on the run. { - my @active_wits; + my %active_wits; my $current_app; + my $seen_apps; sub _get_readings { my( $tradition, $xn, $in_var, $ac, @cur_wits ) = @_; - @cur_wits = @active_wits unless $in_var; + @cur_wits = grep { $active_wits{$_} } keys %active_wits unless $in_var; my @new_readings; if( $xn->nodeType == XML_TEXT_NODE ) { @@ -211,8 +222,7 @@ sub _return_rdg { #print STDERR "Handling text node " . $str . "\n"; # Check that all the witnesses we have are active. foreach my $c ( @cur_wits ) { - warn "Could not find $c in active wits" - unless grep { $c eq $_ } @active_wits; + warn "$c is not among active wits" unless $active_wits{$c}; } $str =~ s/^\s+//; my $final = $str =~ s/\s+$//; @@ -222,7 +232,6 @@ sub _return_rdg { my $rdg = make_reading( $tradition->collation, $w ); push( @new_readings, $rdg ); unless( $in_var ) { - push( @common_readings, $rdg ); $rdg->make_common; } foreach ( @cur_wits ) { @@ -236,14 +245,12 @@ sub _return_rdg { #print STDERR "Handling word " . $xn->toString . "\n"; # Check that all the witnesses we have are active. foreach my $c ( @cur_wits ) { - warn "Could not find $c in active wits" - unless grep { $c eq $_ } @active_wits; + warn "$c is not among active wits" unless $active_wits{$c}; } my $xml_id = $xn->getAttribute( 'xml:id' ); my $rdg = make_reading( $tradition->collation, $xn->textContent, $xml_id ); push( @new_readings, $rdg ); unless( $in_var ) { - push( @common_readings, $rdg ); $rdg->make_common; } foreach( @cur_wits ) { @@ -252,6 +259,7 @@ sub _return_rdg { push( @{$text->{$_}}, $rdg ) unless $ac; } } elsif ( $xn->nodeName eq 'app' ) { + $seen_apps++; $current_app = $xn->getAttribute( 'xml:id' ); # print STDERR "Handling app $current_app\n"; # Keep the reading sets in this app. @@ -276,6 +284,7 @@ sub _return_rdg { } elsif ( $xn->nodeName eq 'lem' || $xn->nodeName eq 'rdg' ) { # Alter the current witnesses and recurse. #print STDERR "Handling reading for " . $xn->getAttribute( 'wit' ) . "\n"; + # TODO handle p.c. and s.l. designations too $ac = $xn->getAttribute( 'type' ) && $xn->getAttribute( 'type' ) eq 'a.c.'; my @rdg_wits = get_sigla( $xn ); @rdg_wits = ( 'base' ) unless @rdg_wits; # Allow for editorially-supplied readings @@ -307,12 +316,30 @@ sub _return_rdg { } elsif( $xn->nodeName eq 'witStart' ) { # Add the relevant wit(s) to the active list. #print STDERR "Handling witStart\n"; - push( @active_wits, @cur_wits ); + map { $active_wits{$_} = 1 } @cur_wits; + # Record a lacuna in all non-active witnesses if this is + # the first app. Get the full list from $text. + if( $seen_apps == 1 ) { + my $i = 0; + foreach my $sig ( keys %$text ) { + next if $active_wits{$sig}; + my $l = $tradition->collation->add_lacuna( $current_app . "_$i" ); + $i++; + push( @{$text->{$sig}}, $l ); + } + } } elsif( $xn->nodeName eq 'witEnd' ) { # Take the relevant wit(s) out of the list. #print STDERR "Handling witEnd\n"; - my $regexp = '^(' . join( '|', @cur_wits ) . ')$'; - @active_wits = grep { $_ !~ /$regexp/ } @active_wits; + map { $active_wits{$_} = undef } @cur_wits; + # Record a lacuna, unless this is the last app. + unless( $seen_apps == $app_count ) { + foreach my $i ( 0 .. $#cur_wits ) { + my $w = $cur_wits[$i]; + my $l = $tradition->collation->add_lacuna( $current_app . "_$i" ); + push( @{$text->{$w}}, $l ); + } + } } elsif( $xn->nodeName eq 'witDetail' ) { # Ignore these for now. return;