X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FSelf.pm;h=8483891b9964c645437ea3d2e5e14b1855205f2f;hb=2a8127263ef278f3f14b480a12b84f9aa4f92fdc;hp=781a73913fe0aadd60e29f9fdc5a60eced4255b3;hpb=32014ec936b48809e9f2dae8f20a01c887253427;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm index 781a739..8483891 100644 --- a/lib/Text/Tradition/Parser/Self.pm +++ b/lib/Text/Tradition/Parser/Self.pm @@ -2,113 +2,289 @@ package Text::Tradition::Parser::Self; use strict; use warnings; -use Text::Tradition::Parser::GraphML; +use Text::Tradition::Parser::GraphML qw/ graphml_parse /; +use TryCatch; =head1 NAME Text::Tradition::Parser::GraphML +=head1 SYNOPSIS + + use Text::Tradition; + + my $t_from_file = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'Self', + 'file' => '/path/to/tradition.xml' + ); + + my $t_from_string = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'Self', + 'string' => $tradition_xml, + ); + =head1 DESCRIPTION Parser module for Text::Tradition to read in its own GraphML output format. -TODO document what this format is. +GraphML is a relatively simple graph description language; a 'graph' element +can have 'node' and 'edge' elements, and each of these can have simple 'data' +elements for attributes to be saved. -=head1 METHODS +The graph itself has attributes as in the Collation object: =over -=item B<parse> +=item * linear -parse( $graph, $graphml_string ); +=item * ac_label -Takes an initialized Text::Tradition::Graph object and a string -containing the GraphML; creates the appropriate nodes and edges on the -graph. 
+=item * baselabel -=cut + +=item * wit_list_separator + +=back -my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $POSITION_KEY ) - = qw/ name reading identical position /; +The node objects have the following attributes: -sub parse { - my( $tradition, $graphml_str ) = @_; - my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str ); +=over + +=item * name + +=item * reading + +=item * identical + +=item * rank + +=item * class + +=back + +The edge objects have the following attributes: + +=over + +=item * class + +=item * witness (for 'path' class edges) + +=item * extra (for 'path' class edges) + +=item * relationship (for 'relationship' class edges) + +=item * equal_rank (for 'relationship' class edges) + +=item * non_correctable (for 'relationship' class edges) + +=item * non_independent (for 'relationship' class edges) + +=back + +=head1 METHODS + +=head2 B<parse> + +parse( $tradition, $opts ); + +Takes an initialized Text::Tradition object and a set of options; creates +the appropriate nodes and edges on the graph. The options hash should +include either a 'file' argument or a 'string' argument, depending on the +source of the XML to be parsed. 
+ +=begin testing + +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +my $tradition = 't/data/florilegium_graphml.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'Self', + 'file' => $tradition, + ); + +is( ref( $t ), 'Text::Tradition', "Parsed GraphML version 2" ); +if( $t ) { + is( scalar $t->collation->readings, 319, "Collation has all readings" ); + is( scalar $t->collation->paths, 376, "Collation has all paths" ); + is( scalar $t->witnesses, 13, "Collation has all witnesses" ); +} +# TODO add a relationship, add a stemma, write graphml, reparse it, check that +# the new data is there +$t->language('Greek'); +$t->add_stemma( 'dotfile' => 't/data/florilegium.dot' ); +$t->collation->add_relationship( 'w12', 'w13', + { 'type' => 'grammatical', 'scope' => 'global', + 'annotation' => 'This is some note' } ); +ok( $t->collation->get_relationship( 'w12', 'w13' ), "Relationship set" ); +my $graphml_str = $t->collation->as_graphml; + +my $newt = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str ); +is( ref( $newt ), 'Text::Tradition', "Parsed current GraphML version" ); +if( $newt ) { + is( scalar $newt->collation->readings, 319, "Collation has all readings" ); + is( scalar $newt->collation->paths, 376, "Collation has all paths" ); + is( scalar $newt->witnesses, 13, "Collation has all witnesses" ); + is( scalar $newt->collation->relationships, 1, "Collation has added relationship" ); + is( $newt->language, 'Greek', "Tradition has correct language setting" ); + my $rel = $newt->collation->get_relationship( 'w12', 'w13' ); + ok( $rel, "Found set relationship" ); + is( $rel->annotation, 'This is some note', "Relationship has its properties" ); + is( scalar $newt->stemmata, 1, "Tradition has its stemma" ); + is( $newt->stemma(0)->witnesses, $t->stemma(0)->witnesses, "Stemma has correct length witness list" ); +} + + +=end testing + +=cut + +# Populate $tradition from parsed GraphML: set the graph-level attributes, +# add the readings, add the witness paths (registering witnesses as they +# appear), add the relationships, then call text_from_paths on the collation. +sub parse { + my( 
$tradition, $opts ) = @_; + + # Collation data is in the first graph; relationship-specific stuff + # is in the second. + my( $graph_data, $rel_data ) = graphml_parse( $opts ); + my $collation = $tradition->collation; my %witnesses; - - # Add the nodes to the graph. - - my $extra_data = {}; # Keep track of data that needs to be processed - # after the nodes & edges are created. - foreach my $n ( @{$graph_data->{'nodes'}} ) { - # Could use a better way of registering these - my %node_data = %$n; - my $nodeid = delete $node_data{$IDKEY}; - my $reading = delete $node_data{$TOKENKEY}; - my $gnode = $collation->add_reading( $nodeid ); - $gnode->text( $reading ); - - # Now save the rest of the data, i.e. not the ID or label, - # if it exists. - if ( keys %node_data ) { - $extra_data->{$nodeid} = \%node_data; + + # print STDERR "Setting graph globals\n"; + $tradition->name( $graph_data->{'name'} ); + my $use_version; + my $tmeta = $tradition->meta; + my $cmeta = $collation->meta; + # NOTE(review): $cmeta is not used again in this sub. + # Route each graph-level attribute: 'version' is remembered for the + # backward-compatibility check further down, 'stemmata' is split into one + # dot string per line, attributes the tradition's metaclass has are set on + # the tradition, and anything else is set on the collation. + foreach my $gkey ( keys %{$graph_data->{'global'}} ) { + my $val = $graph_data->{'global'}->{$gkey}; + if( $gkey eq 'version' ) { + $use_version = $val; + } elsif( $gkey eq 'stemmata' ) { # Special case, yuck + foreach my $dotstr ( split( /\n/, $val ) ) { + $tradition->add_stemma( 'dot' => $dotstr ); + } + } elsif( $tmeta->has_attribute( $gkey ) ) { + $tradition->$gkey( $val ); + } else { + $collation->$gkey( $val ); + } } + + # Add the nodes to the graph. + # Note any reading IDs that were changed in order to comply with XML + # name restrictions; we have to hardcode start & end. + my %namechange = ( '#START#' => '__START__', '#END#' => '__END__' ); + + # print STDERR "Adding collation readings\n"; + foreach my $n ( @{$graph_data->{'nodes'}} ) { + # If it is the start or end node, we already have one, so + # grab the rank and go. 
+ next if( defined $n->{'is_start'} ); + if( defined $n->{'is_end'} ) { + $collation->end->rank( $n->{'rank'} ); + next; + } + my $gnode = $collation->add_reading( $n ); + if( $gnode->id ne $n->{'id'} ) { + $namechange{$n->{'id'}} = $gnode->id; + } } - + # Now add the edges. + # print STDERR "Adding collation path edges\n"; foreach my $e ( @{$graph_data->{'edges'}} ) { - my %edge_data = %$e; - my $from = delete $edge_data{'source'}; - my $to = delete $edge_data{'target'}; - - # Whatever is left tells us what kind of edge it is. - foreach my $wkey ( keys %edge_data ) { - if( $wkey =~ /^witness/ ) { - my $wit = $edge_data{$wkey}; - unless( $witnesses{$wit} ) { - $tradition->add_witness( sigil => $wit ); - $witnesses{$wit} = 1; + # Translate the endpoint IDs through %namechange in case add_reading + # gave the reading a different ID. + my $sourceid = exists $namechange{$e->{'source'}->{'id'}} + ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'}; + my $targetid = exists $namechange{$e->{'target'}->{'id'}} + ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'}; + my $from = $collation->reading( $sourceid ); + my $to = $collation->reading( $targetid ); + + # The warning does not skip the edge; the path is still added below. + # An 'extra' edge gets the collation's ac_label appended to its sigil. + warn "No witness label on path edge!" unless $e->{'witness'}; + my $label = $e->{'witness'} . ( $e->{'extra'} ? $collation->ac_label : '' ); + $collation->add_path( $from, $to, $label ); + + # Add the witness if we don't have it already. + unless( $witnesses{$e->{'witness'}} ) { + $tradition->add_witness( + sigil => $e->{'witness'}, 'sourcetype' => 'collation' ); + $witnesses{$e->{'witness'}} = 1; } - my $label = $wkey eq 'witness_ante_corr' - ? $wit . $collation->ac_label : $wit; - $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label ); - } else { - my $rel = $edge_data{$wkey}; - # TODO handle global relationships - $collation->add_relationship( $rel, $from->{$IDKEY}, $to->{$IDKEY} ); - } - } + $tradition->witness( $e->{'witness'} )->is_layered( 1 ) if $e->{'extra'}; } - - ## Deal with node information (transposition, relationships, etc.) 
that - ## needs to be processed after all the nodes are created. - foreach my $nkey ( keys %$extra_data ) { - foreach my $edkey ( keys %{$extra_data->{$nkey}} ) { - my $this_reading = $collation->reading( $nkey ); - if( $edkey eq $TRANSPOS_KEY ) { - my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} ); - if( $collation->linear ) { - $this_reading->set_identical( $other_reading ); - } else { - $collation->merge_readings( $other_reading, $this_reading ); + + ## Done with the main graph, now look at the relationships. + # Nodes are added via the call to add_reading above. We only need + # add the relationships themselves. + # TODO check that scoping does the right thing + $rel_data->{'edges'} ||= []; # so that the next line doesn't break on no rels + foreach my $e ( sort { _layersort_rel( $a, $b ) } @{$rel_data->{'edges'}} ) { + my $sourceid = exists $namechange{$e->{'source'}->{'id'}} + ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'}; + my $targetid = exists $namechange{$e->{'target'}->{'id'}} + ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'}; + my $from = $collation->reading( $sourceid ); + my $to = $collation->reading( $targetid ); + delete $e->{'source'}; + delete $e->{'target'}; + # The remaining keys are relationship attributes. + # Backward compatibility... + # NOTE(review): $use_version is only assigned when the graph has a + # 'version' global; if it is undef these comparisons warn under + # 'use warnings'. + if( $use_version eq '2.0' || $use_version eq '3.0' ) { + delete $e->{'class'}; + $e->{'type'} = delete $e->{'relationship'} if exists $e->{'relationship'}; + } + # Add the specified relationship unless we already have done. + # Only non-local scopes are checked: an existing relationship between + # the same two readings with the same scope and type counts as a duplicate. + my $rel_exists; + if( $e->{'scope'} ne 'local' ) { + my $relobj = $collation->get_relationship( $from, $to ); + if( $relobj && $relobj->scope eq $e->{'scope'} + && $relobj->type eq $e->{'type'} ) { + $rel_exists = 1; + } + } + # NOTE(review): the catch parameter $e shadows the edge hashref $e + # inside the catch block. + try { + $collation->add_relationship( $from, $to, $e ) unless $rel_exists; + } catch( Text::Tradition::Error $e ) { + warn "DROPPING $from -> $to: " . 
$e->message; } - } elsif ( $edkey eq $POSITION_KEY ) { - $this_reading->position( $extra_data->{$nkey}->{$edkey} ); - } else { - warn "Unfamiliar reading node data $edkey for $nkey"; - } } - } + + # Save the text for each witness so that we can ensure consistency + # later on + $collation->text_from_paths(); +} - # We know what the beginning and ending nodes are, no need to - # search or reset. - my $end_node = $collation->reading( '#END#' ); - $DB::single = 1; - # Walk the paths and make reading sequences for our witnesses. - $collation->walk_witness_paths( $end_node ); +## Sort comparator for relationship edges: a lower %LAYERS value sorts first, +## and any type not listed in %LAYERS is treated as 99. +my %LAYERS = ( + 'collated' => 1, + 'orthographic' => 2, + 'spelling' => 3, + ); + +sub _layersort_rel { + my( $a, $b ) = @_; + # Use 'type' when $a has it, otherwise the older 'relationship' key; the + # same key is then read from both $a and $b. + my $key = exists $a->{'type'} ? 'type' : 'relationship'; + my $at = $LAYERS{$a->{$key}} || 99; + my $bt = $LAYERS{$b->{$key}} || 99; + return $at <=> $bt; } +1; + +=head1 BUGS / TODO + +=over + +=item * Make this into a stream parser with GraphML + +=item * Simplify field -> attribute correspondence for nodes and edges + +=item * Share key name constants with Collation.pm + =back =head1 LICENSE @@ -119,8 +295,4 @@ the same terms as Perl itself. =head1 AUTHOR -Tara L Andrews, aurum@cpan.org - -=cut - -1; +Tara L Andrews E<lt>aurum@cpan.orgE<gt>