From: Tara L Andrews Date: Mon, 3 Oct 2011 01:16:33 +0000 (+0200) Subject: add lacunae properly at start of TEI parsing X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=3bc0cd189b8f6d8182fe009614993805c56deaf6;p=scpubgit%2Fstemmatology.git add lacunae properly at start of TEI parsing --- diff --git a/.gitignore b/.gitignore index ea4e9d5..00c1605 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ *~ -data tla.bbprojectsettings diff --git a/Tradition.bbprojectd/project.bbprojectdata b/Tradition.bbprojectd/project.bbprojectdata index 6a356f2..f3d8ee0 100644 --- a/Tradition.bbprojectd/project.bbprojectdata +++ b/Tradition.bbprojectd/project.bbprojectdata @@ -6,9 +6,44 @@ BA45EE3F-4E79-4734-A808-E988ECE32C18 E5B4FCC5-00C7-4E1D-963F-D72E7027869A + 8752D0C9-D9C0-484A-ADD9-3243A186536F ProjectItems + 8752D0C9-D9C0-484A-ADD9-3243A186536F + + ItemData + + AliasData + + AAAAAAFkAAIAAQxNYWNpbnRvc2ggSEQAAAAAAAAAAAAA + AAAAAADKPI0jSCsAAAAILogBdAAAAAAAAAAAAAAAAAAA + AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + AAAAAAAAAAAAAAAAAAAAAAgx4sose4kAAAAAAAAAAP// + //8AAAkgAAAAAAAAAAAAAAAAAAAADHN0ZW1tYXRvbG9n + eQAQAAgAAMo8cQMAAAARAAgAAMosX2kAAAABABAACC6I + AAckkwAFBAYAAL8xAAIAMk1hY2ludG9zaCBIRDpVc2Vy + czoAdGxhOgBQcm9qZWN0czoAc3RlbW1hdG9sb2d5OgB0 + AA4ABAABAHQADwAaAAwATQBhAGMAaQBuAHQAbwBzAGgA + IABIAEQAEgAhVXNlcnMvdGxhL1Byb2plY3RzL3N0ZW1t + YXRvbG9neS90AAATAAEvAAAVAAIACv//AAA= + + FileURL + file://localhost/Users/tla/Projects/stemmatology/t/ + RelativePath + ./t + TypeID + _CFileLocator + Version + 1 + + ItemName + t + ItemType + FolderReference + UserOverrideItemName + + BA45EE3F-4E79-4734-A808-E988ECE32C18 ItemData diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm index cd66989..23876ab 100644 --- a/lib/Text/Tradition/Parser/TEI.pm +++ b/lib/Text/Tradition/Parser/TEI.pm @@ -61,7 +61,6 @@ sub parse { # First, parse the XML. my $parser = XML::LibXML->new(); - # TODO Try as a string, then try as a filename. my $doc = $parser->parse_string( $xml_str ); my $tei = $doc->documentElement(); my $xpc = XML::LibXML::XPathContext->new( $tei ); @@ -141,9 +140,10 @@ sub parse { foreach ( keys %$substitutions ) { $tradition->collation->del_reading( $tradition->collation->reading( $_ ) ); } - - # Calculate the ranks and flatten the graph based on the results. $tradition->collation->calculate_ranks(); + + # Now that we have ranks, see if we have distinct nodes with identical + # text and identical rank that can be merged. $tradition->collation->flatten_ranks(); } @@ -206,13 +206,13 @@ sub _return_rdg { ## Returns the list of readings, if any, created on the run. { - my @active_wits; + my %active_wits; my $current_app; my $seen_apps; sub _get_readings { my( $tradition, $xn, $in_var, $ac, @cur_wits ) = @_; - @cur_wits = @active_wits unless $in_var; + @cur_wits = grep { $active_wits{$_} } keys %active_wits unless $in_var; my @new_readings; if( $xn->nodeType == XML_TEXT_NODE ) { @@ -222,8 +222,7 @@ sub _return_rdg { #print STDERR "Handling text node " . $str . "\n"; # Check that all the witnesses we have are active. foreach my $c ( @cur_wits ) { - warn "Could not find $c in active wits" - unless grep { $c eq $_ } @active_wits; + warn "$c is not among active wits" unless $active_wits{$c}; } $str =~ s/^\s+//; my $final = $str =~ s/\s+$//; @@ -246,8 +245,7 @@ sub _return_rdg { #print STDERR "Handling word " . $xn->toString . "\n"; # Check that all the witnesses we have are active. foreach my $c ( @cur_wits ) { - warn "Could not find $c in active wits" - unless grep { $c eq $_ } @active_wits; + warn "$c is not among active wits" unless $active_wits{$c}; } my $xml_id = $xn->getAttribute( 'xml:id' ); my $rdg = make_reading( $tradition->collation, $xn->textContent, $xml_id ); @@ -286,6 +284,7 @@ sub _return_rdg { } elsif ( $xn->nodeName eq 'lem' || $xn->nodeName eq 'rdg' ) { # Alter the current witnesses and recurse. #print STDERR "Handling reading for " . $xn->getAttribute( 'wit' ) . "\n"; + # TODO handle p.c. and s.l. designations too $ac = $xn->getAttribute( 'type' ) && $xn->getAttribute( 'type' ) eq 'a.c.'; my @rdg_wits = get_sigla( $xn ); @rdg_wits = ( 'base' ) unless @rdg_wits; # Allow for editorially-supplied readings @@ -317,12 +316,22 @@ sub _return_rdg { } elsif( $xn->nodeName eq 'witStart' ) { # Add the relevant wit(s) to the active list. #print STDERR "Handling witStart\n"; - push( @active_wits, @cur_wits ); + map { $active_wits{$_} = 1 } @cur_wits; + # Record a lacuna in all non-active witnesses if this is + # the first app. Get the full list from $text. + if( $seen_apps == 1 ) { + my $i = 0; + foreach my $sig ( keys %$text ) { + next if $active_wits{$sig}; + my $l = $tradition->collation->add_lacuna( $current_app . "_$i" ); + $i++; + push( @{$text->{$sig}}, $l ); + } + } } elsif( $xn->nodeName eq 'witEnd' ) { # Take the relevant wit(s) out of the list. #print STDERR "Handling witEnd\n"; - my $regexp = '^(' . join( '|', @cur_wits ) . ')$'; - @active_wits = grep { $_ !~ /$regexp/ } @active_wits; + map { $active_wits{$_} = undef } @cur_wits; # Record a lacuna, unless this is the last app. unless( $seen_apps == $app_count ) { foreach my $i ( 0 .. $#cur_wits ) {