X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FSelf.pm;h=ca58c33bd1cf3c812fb27888012847f09c1445ee;hb=0655d30ce9cda59f3f25091aa4bb50e2b28c65a5;hp=5311660a7369b02ae864e09c0ccc2204cd3b66b0;hpb=d9e873d0f90f1b7aec072c16a0eed37878f7f47f;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm index 5311660..ca58c33 100644 --- a/lib/Text/Tradition/Parser/Self.pm +++ b/lib/Text/Tradition/Parser/Self.pm @@ -2,129 +2,238 @@ package Text::Tradition::Parser::Self; use strict; use warnings; -use Text::Tradition::Parser::GraphML; +use Text::Tradition::Parser::GraphML qw/ graphml_parse /; =head1 NAME Text::Tradition::Parser::GraphML +=head1 SYNOPSIS + + use Text::Tradition; + + my $t_from_file = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'Self', + 'file' => '/path/to/tradition.xml' + ); + + my $t_from_string = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'Self', + 'string' => $tradition_xml, + ); + =head1 DESCRIPTION Parser module for Text::Tradition to read in its own GraphML output format. -TODO document what this format is. +GraphML is a relatively simple graph description language; a 'graph' element +can have 'node' and 'edge' elements, and each of these can have simple 'data' +elements for attributes to be saved. 
-=head1 METHODS +The graph itself has attributes as in the Collation object: + +=over + +=item * linear + +=item * ac_label + +=item * baselabel + +=item * wit_list_separator + +=back + +The node objects have the following attributes: + +=over + +=item * name + +=item * reading + +=item * identical + +=item * rank + +=item * class + +=back + +The edge objects have the following attributes: =over -=item B<parse> +=item * class -parse( $graph, $graphml_string ); +=item * witness (for 'path' class edges) -Takes an initialized Text::Tradition::Graph object and a string -containing the GraphML; creates the appropriate nodes and edges on the -graph. +=item * extra (for 'path' class edges) + +=item * relationship (for 'relationship' class edges) + +=item * equal_rank (for 'relationship' class edges) + +=item * non_correctable (for 'relationship' class edges) + +=item * non_independent (for 'relationship' class edges) + +=back + +=head1 METHODS + +=head2 B<parse> + +parse( $graph, $opts ); + +Takes an initialized Text::Tradition object and a set of options; creates +the appropriate nodes and edges on the graph. The options hash should +include either a 'file' argument or a 'string' argument, depending on the +source of the XML to be parsed. 
+ +=begin testing + +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +my $tradition = 't/data/florilegium_graphml.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'Self', + 'file' => $tradition, + ); + +is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" ); +if( $t ) { + is( scalar $t->collation->readings, 319, "Collation has all readings" ); + is( scalar $t->collation->paths, 376, "Collation has all paths" ); + is( scalar $t->witnesses, 13, "Collation has all witnesses" ); +} + +=end testing =cut -my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY ) - = qw/ name reading identical rank class /; +my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, + $START_KEY, $END_KEY, $LACUNA_KEY, $COMMON_KEY, + $SOURCE_KEY, $TARGET_KEY, $WITNESS_KEY, $EXTRA_KEY, $RELATIONSHIP_KEY, + $SCOPE_KEY, $CORRECT_KEY, $INDEP_KEY ) + = qw/ id text identical rank + is_start is_end is_lacuna is_common + source target witness extra relationship + scope non_correctable non_independent /; sub parse { - my( $tradition, $graphml_str ) = @_; - my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str ); - + my( $tradition, $opts ) = @_; + + # Collation data is in the first graph; relationship-specific stuff + # is in the second. + my( $graph_data, $rel_data ) = graphml_parse( $opts ); + my $collation = $tradition->collation; my %witnesses; - + + # print STDERR "Setting graph globals\n"; + $tradition->name( $graph_data->{'name'} ); + my $use_version; + foreach my $gkey ( keys %{$graph_data->{'global'}} ) { + my $val = $graph_data->{'global'}->{$gkey}; + if( $gkey eq 'version' ) { + $use_version = $val; + } else { + $collation->$gkey( $val ); + } + } + # Add the nodes to the graph. - # TODO Are we adding extra start/end nodes? - - my $extra_data = {}; # Keep track of data that needs to be processed - # after the nodes & edges are created. 
- print STDERR "Adding graph nodes\n"; - foreach my $n ( @{$graph_data->{'nodes'}} ) { - # Each node is either a segment or a reading, depending on - # its class. Readings have text, segments don't. - my %node_data = %$n; - my $nodeid = delete $node_data{$IDKEY}; - my $reading = delete $node_data{$TOKENKEY}; - my $class = $node_data{$CLASS_KEY} || ''; - # TODO this is a hack, fix it? - $class = 'reading' unless $class eq 'segment'; - my $method = $class eq 'segment' ? "add_$class" : "add_reading"; - my $gnode = $collation->$method( $nodeid ); - $gnode->label( $reading ); - $gnode->set_common if $class eq 'common'; - - # Now save the rest of the data, i.e. not the ID or label, - # if it exists. - if ( keys %node_data ) { - $extra_data->{$nodeid} = \%node_data; - } + + # print STDERR "Adding graph nodes\n"; + foreach my $n ( @{$graph_data->{'nodes'}} ) { + # If it is the start or end node, we already have one, so + # grab the rank and go. + next if( defined $n->{$START_KEY} ); + if( defined $n->{$END_KEY} ) { + $collation->end->rank( $n->{$RANK_KEY} ); + next; + } + + # First extract the data that we can use without reference to + # anything else. + + # Create the node. + my $reading_options = { + 'id' => $n->{$IDKEY}, + 'is_lacuna' => $n->{$LACUNA_KEY}, + 'is_common' => $n->{$COMMON_KEY}, + }; + my $rank = $n->{$RANK_KEY}; + $reading_options->{'rank'} = $rank if $rank; + my $text = $n->{$TOKENKEY}; + $reading_options->{'text'} = $text if $text; + + my $gnode = $collation->add_reading( $reading_options ); } # Now add the edges. - print STDERR "Adding graph edges\n"; + # print STDERR "Adding graph edges\n"; foreach my $e ( @{$graph_data->{'edges'}} ) { - my %edge_data = %$e; - my $from = delete $edge_data{'source'}; - my $to = delete $edge_data{'target'}; - my $class = delete $edge_data{'class'}; - - # Whatever is left tells us what kind of edge it is. 
- foreach my $wkey ( keys %edge_data ) { - if( $wkey =~ /^witness/ ) { - unless( $class eq 'path' ) { - warn "Cannot add witness label to a $class edge"; - next; - } - my $wit = $edge_data{$wkey}; - unless( $witnesses{$wit} ) { - $tradition->add_witness( sigil => $wit ); - $witnesses{$wit} = 1; - } - my $label = $wkey eq 'witness_ante_corr' - ? $wit . $collation->ac_label : $wit; - $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label ); - } elsif( $wkey eq 'relationship' ) { - unless( $class eq 'relationship' ) { - warn "Cannot add relationship label to a $class edge"; - next; - } - my $rel = $edge_data{$wkey}; - # TODO handle global relationships - $collation->add_relationship( $rel, $from->{$IDKEY}, $to->{$IDKEY} ); - } else { - my $seg_edge = $collation->graph->add_edge( $from->{$IDKEY}, $to->{$IDKEY} ); - $seg_edge->set_attribute( 'class', 'segment' ); - } - } + my $from = $e->{$SOURCE_KEY}; + my $to = $e->{$TARGET_KEY}; + + # We need the witness, and whether it is an 'extra' reading path. + my $wit = $e->{$WITNESS_KEY}; + warn "No witness label on path edge!" unless $wit; + my $extra = $e->{$EXTRA_KEY}; + my $label = $wit . ( $extra ? $collation->ac_label : '' ); + $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label ); + # Add the witness if we don't have it already. + unless( $witnesses{$wit} ) { + $tradition->add_witness( sigil => $wit ); + $witnesses{$wit} = 1; + } + $tradition->witness( $wit )->is_layered( 1 ) if $extra; } + + ## Done with the main graph, now look at the relationships. + # Nodes are added via the call to add_reading above. We only need + # add the relationships themselves. 
+ # TODO check that scoping does trt + foreach my $e ( @{$rel_data->{'edges'}} ) { + my $from = $e->{$SOURCE_KEY}; + my $to = $e->{$TARGET_KEY}; + my $relationship_opts = { + 'type' => $e->{$RELATIONSHIP_KEY}, + 'scope' => $e->{$SCOPE_KEY}, + }; + $relationship_opts->{'non_correctable'} = $e->{$CORRECT_KEY} + if exists $e->{$CORRECT_KEY}; + $relationship_opts->{'non_independent'} = $e->{$INDEP_KEY} + if exists $e->{$INDEP_KEY}; + $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, + $relationship_opts ); + } + + # Save the text for each witness so that we can ensure consistency + # later on + $tradition->collation->text_from_paths(); - ## Deal with node information (transposition, relationships, etc.) that - ## needs to be processed after all the nodes are created. - print STDERR "Adding second-pass data\n"; - my $linear = undef; - foreach my $nkey ( keys %$extra_data ) { - foreach my $edkey ( keys %{$extra_data->{$nkey}} ) { - my $this_reading = $collation->reading( $nkey ); - if( $edkey eq $TRANSPOS_KEY ) { - my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} ); - # We evidently have a linear graph. - $linear = 1; - $this_reading->set_identical( $other_reading ); - } elsif ( $edkey eq $RANK_KEY ) { - $this_reading->rank( $extra_data->{$nkey}->{$edkey} ); - } else { - warn "Unfamiliar reading node data $edkey for $nkey"; - } - } - } - $collation->linear( $linear ); - # TODO We probably need to set the $witness->path arrays for each wit. } +1; + +=head1 BUGS / TODO + +=over + +=item * Make this into a stream parser with GraphML + +=item * Simplify field -> attribute correspondence for nodes and edges + +=item * Share key name constants with Collation.pm + =back =head1 LICENSE @@ -135,8 +244,4 @@ the same terms as Perl itself. =head1 AUTHOR -Tara L Andrews, aurum@cpan.org - -=cut - -1; +Tara L Andrews E<lt>aurum@cpan.orgE<gt>