X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FCollateX.pm;h=62123fe4707106332225f9c2cf28ab5e2f731481;hb=94a077d641e8c906d7131a059b009f335781337a;hp=7775a75b9d83ee106890b426be7e70c5237fc43b;hpb=f6066bac61bc5609c60d48df17aad924c8944177;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm index 7775a75..62123fe 100644 --- a/lib/Text/Tradition/Parser/CollateX.pm +++ b/lib/Text/Tradition/Parser/CollateX.pm @@ -2,12 +2,28 @@ package Text::Tradition::Parser::CollateX; use strict; use warnings; -use Text::Tradition::Parser::GraphML; +use Text::Tradition::Parser::GraphML qw/ graphml_parse populate_witness_path /; =head1 NAME Text::Tradition::Parser::CollateX +=head1 SYNOPSIS + + use Text::Tradition; + + my $t_from_file = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'CollateX', + 'file' => '/path/to/collation.xml' + ); + + my $t_from_string = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'CollateX', + 'string' => $collation_xml, + ); + =head1 DESCRIPTION Parser module for Text::Tradition, given a GraphML file from the @@ -17,15 +33,42 @@ http://gregor.middell.net/collatex/ =head1 METHODS -=over +=head2 B + +parse( $tradition, $init_options ); -=item B +Takes an initialized Text::Tradition object and a set of options; creates +the appropriate nodes and edges on the graph. The options hash should +include either a 'file' argument or a 'string' argument, depending on the +source of the XML to be parsed. -parse( $graph, $graphml_string ); +=begin testing -Takes an initialized Text::Tradition::Graph object and a string -containing the GraphML; creates the appropriate nodes and edges on the -graph. +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +my $cxfile = 't/data/Collatex-16.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'CollateX', + 'file' => $cxfile, + ); + +is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" ); +if( $t ) { + is( scalar $t->collation->readings, 26, "Collation has all readings" ); + is( scalar $t->collation->paths, 49, "Collation has all paths" ); + is( scalar $t->witnesses, 3, "Collation has all witnesses" ); + + # Check an 'identical' node + my $transposed = $t->collation->reading( 'n15' ); + ok( $transposed->has_primary, "Reading links to transposed primary" ); + is( $transposed->primary->name, 'n17', "Correct transposition link" ); +} + +=end testing =cut @@ -34,8 +77,8 @@ my $CONTENTKEY = 'token'; my $TRANSKEY = 'identical'; sub parse { - my( $tradition, $graphml_str ) = @_; - my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str ); + my( $tradition, $opts ) = @_; + my $graph_data = graphml_parse( $opts ); my $collation = $tradition->collation; my %witnesses; # Keep track of the witnesses we encounter as we # run through the graph data. @@ -43,92 +86,98 @@ sub parse { # Add the nodes to the graph. First delete the start node, because # GraphML graphs will have their own start nodes. $collation->del_reading( $collation->start() ); + $collation->del_reading( $collation->end() ); my $extra_data = {}; # Keep track of info to be processed after all # nodes have been created foreach my $n ( @{$graph_data->{'nodes'}} ) { - my %node_data = %$n; - my $nodeid = delete $node_data{$IDKEY}; - my $token = delete $node_data{$CONTENTKEY}; - unless( defined $nodeid && defined $token ) { - $DB::single = 1; - warn "Did not find an ID or token for graph node, can't add it"; - next; - } - my $gnode = $collation->add_reading( $nodeid ); - $gnode->text( $token ); - - # Whatever is left is extra info to be processed later. - if( keys %node_data ) { - $extra_data->{$nodeid} = \%node_data; - } + my %node_data = %$n; + my $nodeid = delete $node_data{$IDKEY}; + my $token = delete $node_data{$CONTENTKEY}; + unless( defined $nodeid && defined $token ) { + warn "Did not find an ID or token for graph node, can't add it"; + next; + } + my $gnode = $collation->add_reading( $nodeid ); + $gnode->text( $token ); + + # Whatever is left is extra info to be processed later. + if( keys %node_data ) { + $extra_data->{$nodeid} = \%node_data; + } } - + # Now add the edges. foreach my $e ( @{$graph_data->{'edges'}} ) { - my %edge_data = %$e; - my $from = delete $edge_data{'source'}; - my $to = delete $edge_data{'target'}; - - # In CollateX, we have a distinct witness data ID per witness, - # so that we can have multiple witnesses per edge. We want to - # translate this to one witness per edge in our own - # representation. - foreach my $ekey ( keys %edge_data ) { - my $wit = $edge_data{$ekey}; - # Create the witness object if it does not yet exist. - unless( $witnesses{$wit} ) { - $tradition->add_witness( 'sigil' => $wit ); - $witnesses{$wit} = 1; - } - $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit ); - } + my %edge_data = %$e; + my $from = delete $edge_data{'source'}; + my $to = delete $edge_data{'target'}; + + # In CollateX, we have a distinct witness data ID per witness, + # so that we can have multiple witnesses per edge. We want to + # translate this to one witness per edge in our own + # representation. + foreach my $ekey ( keys %edge_data ) { + my $wit = $edge_data{$ekey}; + # Create the witness object if it does not yet exist. + unless( $witnesses{$wit} ) { + $tradition->add_witness( 'sigil' => $wit ); + $witnesses{$wit} = 1; + } + $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit ); + } } # Process the extra node data if it exists. foreach my $nodeid ( keys %$extra_data ) { - my $ed = $extra_data->{$nodeid}; - if( exists $ed->{$TRANSKEY} ) { - - my $tn_reading = $collation->reading( $nodeid ); - my $main_reading = $collation->reading( $ed->{$TRANSKEY} ); - if( $collation->linear ) { - $tn_reading->set_identical( $main_reading ); - } else { - $collation->merge_readings( $main_reading, $tn_reading ); - } - } # else we don't have any other tags to process yet. + my $ed = $extra_data->{$nodeid}; + if( exists $ed->{$TRANSKEY} ) { + + my $tn_reading = $collation->reading( $nodeid ); + my $main_reading = $collation->reading( $ed->{$TRANSKEY} ); + if( $collation->linear ) { + $tn_reading->set_identical( $main_reading ); + } else { + $collation->merge_readings( $main_reading, $tn_reading ); + } + } # else we don't have any other tags to process yet. } # Find the beginning and end nodes of the graph. The beginning node # has no incoming edges; the end node has no outgoing edges. my( $begin_node, $end_node ); foreach my $gnode ( $collation->readings() ) { - # print STDERR "Checking node " . $gnode->name . "\n"; - my @outgoing = $gnode->outgoing(); - my @incoming = $gnode->incoming(); - - unless( scalar @incoming ) { - warn "Already have a beginning node" if $begin_node; - $begin_node = $gnode; - $collation->start( $gnode ); - } - unless( scalar @outgoing ) { - warn "Already have an ending node" if $end_node; - $end_node = $gnode; - } + # print STDERR "Checking node " . $gnode->name . "\n"; + my @outgoing = $gnode->outgoing(); + my @incoming = $gnode->incoming(); + + unless( scalar @incoming ) { + warn "Already have a beginning node" if $begin_node; + $begin_node = $gnode; + $collation->start( $gnode ); + } + unless( scalar @outgoing ) { + warn "Already have an ending node" if $end_node; + $end_node = $gnode; + $collation->end( $gnode ); + } } + + # Set the $witness->path arrays for each wit. + populate_witness_path( $tradition ); - # Record for each witness its sequence of readings, and determine - # the common nodes. - my @common_nodes = $collation->walk_witness_paths( $end_node ); - - # Now we have added the witnesses and their paths, so have also - # implicitly marked the common nodes. Now we can calculate their - # explicit positions. - $collation->calculate_positions( @common_nodes ); + # Rank the readings. + $collation->calculate_ranks() if $collation->linear; } +=head1 BUGS / TODO + +=over + +=item * Make this into a stream parser with GraphML + +=item * Use CollateX-calculated ranks instead of recalculating our own + =back =head1 LICENSE