X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FCollateX.pm;h=3521d9ffb0611590a6448c9ccde5dfcd3a70dc50;hb=fedee8dac79426f8a1f7ae70d95478c6fcd5d69a;hp=474d3ff298f798abb70588c06597f5d105193270;hpb=e867486f69f12dc06304594022c298935d1c7fb9;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm index 474d3ff..3521d9f 100644 --- a/lib/Text/Tradition/Parser/CollateX.pm +++ b/lib/Text/Tradition/Parser/CollateX.pm @@ -2,7 +2,7 @@ package Text::Tradition::Parser::CollateX; use strict; use warnings; -use Text::Tradition::Parser::GraphML qw/ graphml_parse populate_witness_path /; +use Text::Tradition::Parser::GraphML qw/ graphml_parse /; =head1 NAME @@ -56,16 +56,17 @@ my $t = Text::Tradition->new( 'file' => $cxfile, ); -is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" ); +is( ref( $t ), 'Text::Tradition', "Parsed a CollateX input" ); if( $t ) { is( scalar $t->collation->readings, 26, "Collation has all readings" ); - is( scalar $t->collation->paths, 49, "Collation has all paths" ); + is( scalar $t->collation->paths, 32, "Collation has all paths" ); is( scalar $t->witnesses, 3, "Collation has all witnesses" ); # Check an 'identical' node my $transposed = $t->collation->reading( 'n15' ); - ok( $transposed->has_primary, "Reading links to transposed primary" ); - is( $transposed->primary->name, 'n17', "Correct transposition link" ); + my @related = $transposed->related_readings; + is( scalar @related, 1, "Reading links to transposed version" ); + is( $related[0]->id, 'n18', "Correct transposition link" ); } =end testing @@ -73,101 +74,75 @@ if( $t ) { =cut my $IDKEY = 'number'; -my $CONTENTKEY = 'token'; -my $TRANSKEY = 'identical'; +my $CONTENTKEY = 'tokens'; +my $EDGETYPEKEY = 'type'; +my $WITKEY = 'witnesses'; sub parse { my( $tradition, $opts ) = @_; - my $graph_data = graphml_parse( $opts ); + my( $graph_data ) = graphml_parse( $opts ); my $collation = $tradition->collation; - my %witnesses; # Keep track of the witnesses we encounter as we - # run through the graph data. - # Add the nodes to the graph. First delete the start node, because - # GraphML graphs will have their own start nodes. - $collation->del_reading( $collation->start() ); - $collation->del_reading( $collation->end() ); - - my $extra_data = {}; # Keep track of info to be processed after all - # nodes have been created + # First add the readings to the graph. + ## Assume the start node has no text and id 0, and the end node has + ## no text and ID [number of nodes] - 1. + my $endnode = scalar @{$graph_data->{'nodes'}} - 1; foreach my $n ( @{$graph_data->{'nodes'}} ) { - my %node_data = %$n; - my $nodeid = delete $node_data{$IDKEY}; - my $token = delete $node_data{$CONTENTKEY}; - unless( defined $nodeid && defined $token ) { - warn "Did not find an ID or token for graph node, can't add it"; + unless( defined $n->{$IDKEY} && defined $n->{$CONTENTKEY} ) { + if( defined $n->{$IDKEY} && $n->{$IDKEY} == 0 ) { + # It's the start node. + $n->{$IDKEY} = $collation->start->id; + } elsif ( defined $n->{$IDKEY} && $n->{$IDKEY} == $endnode ) { + # It's the end node. + $n->{$IDKEY} = $collation->end->id; + } else { + # Something is probably wrong. + warn "Did not find an ID or token for graph node, can't add it"; + } next; } - my $gnode = $collation->add_reading( $nodeid ); - $gnode->text( $token ); - - # Whatever is left is extra info to be processed later. - if( keys %node_data ) { - $extra_data->{$nodeid} = \%node_data; - } + # Node ID should be an XML name, so prepend an 'n' if necessary. + if( $n->{$IDKEY} =~ /^\d/ ) { + $n->{$IDKEY} = 'n' . $n->{$IDKEY}; + } + # Create the reading. + my $gnode_args = { + 'id' => $n->{$IDKEY}, + 'text' => $n->{$CONTENTKEY}, + }; + my $gnode = $collation->add_reading( $gnode_args ); } - # Now add the edges. + # Now add the path edges. foreach my $e ( @{$graph_data->{'edges'}} ) { - my %edge_data = %$e; - my $from = delete $edge_data{'source'}; - my $to = delete $edge_data{'target'}; - - # In CollateX, we have a distinct witness data ID per witness, - # so that we can have multiple witnesses per edge. We want to - # translate this to one witness per edge in our own - # representation. - foreach my $ekey ( keys %edge_data ) { - my $wit = $edge_data{$ekey}; - # Create the witness object if it does not yet exist. - unless( $witnesses{$wit} ) { - $tradition->add_witness( 'sigil' => $wit ); - $witnesses{$wit} = 1; - } - $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit ); - } - } - - # Process the extra node data if it exists. - foreach my $nodeid ( keys %$extra_data ) { - my $ed = $extra_data->{$nodeid}; - if( exists $ed->{$TRANSKEY} ) { - - my $tn_reading = $collation->reading( $nodeid ); - my $main_reading = $collation->reading( $ed->{$TRANSKEY} ); - if( $collation->linear ) { - $tn_reading->set_identical( $main_reading ); - } else { - $collation->merge_readings( $main_reading, $tn_reading ); - } - } # else we don't have any other tags to process yet. - } - - # Find the beginning and end nodes of the graph. The beginning node - # has no incoming edges; the end node has no outgoing edges. - my( $begin_node, $end_node ); - foreach my $gnode ( $collation->readings() ) { - # print STDERR "Checking node " . $gnode->name . "\n"; - my @outgoing = $gnode->outgoing(); - my @incoming = $gnode->incoming(); - - unless( scalar @incoming ) { - warn "Already have a beginning node" if $begin_node; - $begin_node = $gnode; - $collation->start( $gnode ); - } - unless( scalar @outgoing ) { - warn "Already have an ending node" if $end_node; - $end_node = $gnode; - $collation->end( $gnode ); + my $from = $e->{'source'}; + my $to = $e->{'target'}; + + ## Edge data keys are ID (which we don't need), witnesses, and type. + ## Type can be 'path' or 'relationship'; + ## witnesses is a comma-separated list. + if( $e->{$EDGETYPEKEY} eq 'path' ) { + ## Add the path for each witness listesd. + # Create the witness objects if they does not yet exist. + foreach my $wit ( split( /, /, $e->{$WITKEY} ) ) { + unless( $tradition->witness( $wit ) ) { + $tradition->add_witness( + 'sigil' => $wit, 'sourcetype' => 'collation' ); + } + $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit ); + } + } else { # type 'relationship' + $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, + { 'type' => 'transposition' } ); } } - - # Set the $witness->path arrays for each wit. - populate_witness_path( $tradition ); # Rank the readings. - $collation->calculate_ranks(); + $collation->calculate_common_readings(); # will implicitly rank + + # Save the text for each witness so that we can ensure consistency + # later on + $tradition->collation->text_from_paths(); } =head1 BUGS / TODO