From: Tara L Andrews Date: Mon, 3 Oct 2011 21:15:48 +0000 (+0200) Subject: continued doc and testing drive; rationalize GraphML a little X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e867486f69f12dc06304594022c298935d1c7fb9;p=scpubgit%2Fstemmatology.git continued doc and testing drive; rationalize GraphML a little --- diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index 1ef3985..7606cc2 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -1278,3 +1278,13 @@ sub add_hash_entry { no Moose; __PACKAGE__->meta->make_immutable; + +=head1 BUGS / TODO + +=over + +=item * Rationalize edge classes + +=item * Port the internal graph from Graph::Easy to Graph + +=back diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm index 2ab0546..474d3ff 100644 --- a/lib/Text/Tradition/Parser/CollateX.pm +++ b/lib/Text/Tradition/Parser/CollateX.pm @@ -2,12 +2,28 @@ package Text::Tradition::Parser::CollateX; use strict; use warnings; -use Text::Tradition::Parser::GraphML; +use Text::Tradition::Parser::GraphML qw/ graphml_parse populate_witness_path /; =head1 NAME Text::Tradition::Parser::CollateX +=head1 SYNOPSIS + + use Text::Tradition; + + my $t_from_file = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'CollateX', + 'file' => '/path/to/collation.xml' + ); + + my $t_from_string = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'CollateX', + 'string' => $collation_xml, + ); + =head1 DESCRIPTION Parser module for Text::Tradition, given a GraphML file from the @@ -17,15 +33,42 @@ http://gregor.middell.net/collatex/ =head1 METHODS -=over - -=item B +=head2 B parse( $tradition, $init_options ); -Takes an initialized Text::Tradition::Graph object and its initialization -options, including the data source; creates the appropriate nodes and edges -on the graph. +Takes an initialized Text::Tradition object and a set of options; creates +the appropriate nodes and edges on the graph. The options hash should +include either a 'file' argument or a 'string' argument, depending on the +source of the XML to be parsed. + +=begin testing + +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +my $cxfile = 't/data/Collatex-16.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'CollateX', + 'file' => $cxfile, + ); + +is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" ); +if( $t ) { + is( scalar $t->collation->readings, 26, "Collation has all readings" ); + is( scalar $t->collation->paths, 49, "Collation has all paths" ); + is( scalar $t->witnesses, 3, "Collation has all witnesses" ); + + # Check an 'identical' node + my $transposed = $t->collation->reading( 'n15' ); + ok( $transposed->has_primary, "Reading links to transposed primary" ); + is( $transposed->primary->name, 'n17', "Correct transposition link" ); +} + +=end testing =cut @@ -35,7 +78,7 @@ my $TRANSKEY = 'identical'; sub parse { my( $tradition, $opts ) = @_; - my $graph_data = Text::Tradition::Parser::GraphML::parse( $opts ); + my $graph_data = graphml_parse( $opts ); my $collation = $tradition->collation; my %witnesses; # Keep track of the witnesses we encounter as we # run through the graph data. @@ -120,14 +163,21 @@ sub parse { } } - # TODO Need to populate $wit->path / uncorrected_path + # Set the $witness->path arrays for each wit. + populate_witness_path( $tradition ); - # Now we have added the witnesses and their paths, so we can - # calculate their explicit positions. - # TODO CollateX does this, and we should just have it exported there. + # Rank the readings. $collation->calculate_ranks(); } +=head1 BUGS / TODO + +=over + +=item * Make this into a stream parser with GraphML + +=item * Use CollateX-calculated ranks instead of recalculating our own + =back =head1 LICENSE diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm index c29f78a..3ecd936 100644 --- a/lib/Text/Tradition/Parser/GraphML.pm +++ b/lib/Text/Tradition/Parser/GraphML.pm @@ -2,9 +2,14 @@ package Text::Tradition::Parser::GraphML; use strict; use warnings; +use Exporter 'import'; +use vars qw/ @EXPORT_OK $xpc /; + use XML::LibXML; use XML::LibXML::XPathContext; +@EXPORT_OK = qw/ graphml_parse populate_witness_path /; + =head1 NAME Text::Tradition::Parser::GraphML @@ -19,9 +24,7 @@ GraphML conventions of the source program. =head1 METHODS -=over - -=item B +=head2 B( $init_opts ) parse( $init_opts ); @@ -31,17 +34,15 @@ and their associated data. =cut -use vars qw/ $xpc $graphattr $nodedata $witnesses /; - # Return graph -> nodeid -> { key1/val1, key2/val2, key3/val3 ... } # -> edgeid -> { source, target, wit1/val1, wit2/val2 ...} -sub parse { +sub graphml_parse { my( $opts ) = @_; my $graph_hash = { 'nodes' => [], 'edges' => [] }; - + my $parser = XML::LibXML->new(); my $doc; if( exists $opts->{'string'} ) { @@ -53,6 +54,7 @@ sub parse { return; } + my( $graphattr, $nodedata, $witnesses ) = ( {}, {}, {} ); my $graphml = $doc->documentElement(); $xpc = XML::LibXML::XPathContext->new( $graphml ); $xpc->registerNs( 'g', 'http://graphml.graphdrawing.org/xmlns' ); @@ -127,6 +129,29 @@ sub parse { return $graph_hash; } +=head2 B( $tradition ) + +Given a tradition, populate the 'path' and 'uncorrected_path' attributes +of all of its witnesses. Useful for all formats based on the graph itself. + +=cut + +sub populate_witness_path { + my ( $tradition, $ante_corr ) = @_; + my $c = $tradition->collation; + print STDERR "Walking paths for witnesses\n"; + foreach my $wit ( $tradition->witnesses ) { + my @path = $c->reading_sequence( $c->start, $c->end, $wit->sigil ); + $wit->path( \@path ); + if( $ante_corr->{$wit->sigil} ) { + # Get the uncorrected path too + my @uc = $c->reading_sequence( $c->start, $c->end, + $wit->sigil . $c->ac_label, $wit->sigil ); + $wit->uncorrected_path( \@uc ); + } + } +} + sub _lookup_node_data { my( $xmlnode, $key ) = @_; my $lookup_xpath = './g:data[@key="%s"]/child::text()'; diff --git a/lib/Text/Tradition/Parser/KUL.pm b/lib/Text/Tradition/Parser/KUL.pm index 0469be6..1ac3b9a 100644 --- a/lib/Text/Tradition/Parser/KUL.pm +++ b/lib/Text/Tradition/Parser/KUL.pm @@ -7,19 +7,18 @@ use Text::CSV::Simple; # TODO convert to CSV_XS =head1 NAME -Text::Tradition::Parser::CSV +Text::Tradition::Parser::KUL =head1 DESCRIPTION Parser module for Text::Tradition, given a list of variants as a CSV file and a reference text as a plaintext file with appropriate line -breaks. +breaks. The CSV file is a specialized format developed at KU Leuven, +and other formats are vastly preferred. =head1 METHODS -=over - -=item B +=head2 B my @apparatus = read( $csv_file ); @@ -114,8 +113,6 @@ sub read { return @app_list; } -=back - =head1 LICENSE This package is free software and is provided "as is" without express diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm index 0c4b83f..95ec304 100644 --- a/lib/Text/Tradition/Parser/Self.pm +++ b/lib/Text/Tradition/Parser/Self.pm @@ -2,32 +2,121 @@ package Text::Tradition::Parser::Self; use strict; use warnings; -use Text::Tradition::Parser::GraphML; +use Text::Tradition::Parser::GraphML qw/ graphml_parse populate_witness_path /; =head1 NAME Text::Tradition::Parser::GraphML +=head1 SYNOPSIS + + use Text::Tradition; + + my $t_from_file = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'Self', + 'file' => '/path/to/tradition.xml' + ); + + my $t_from_string = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'Self', + 'string' => $tradition_xml, + ); + =head1 DESCRIPTION Parser module for Text::Tradition to read in its own GraphML output format. -TODO document what this format is. +GraphML is a relatively simple graph description language; a 'graph' element +can have 'node' and 'edge' elements, and each of these can have simple 'data' +elements for attributes to be saved. -=head1 METHODS +The graph itself has attributes as in the Collation object: + +=over + +=item * linear + +=item * ac_label + +=item * baselabel + +=item * wit_list_separator + +=back + +The node objects have the following attributes: =over -=item B +=item * name + +=item * reading + +=item * identical + +=item * rank + +=item * class + +=back + +The edge objects have the following attributes: + +=over + +=item * class + +=item * witness (for 'path' class edges) + +=item * extra (for 'path' class edges) + +=item * relationship (for 'relationship' class edges) + +=item * equal_rank (for 'relationship' class edges) -parse( $graph, $graphml_string ); +=item * non_correctable (for 'relationship' class edges) -Takes an initialized Text::Tradition::Graph object and a string -containing the GraphML; creates the appropriate nodes and edges on the -graph. +=item * non_independent (for 'relationship' class edges) + +=back + +=head1 METHODS + +=head2 B + +parse( $graph, $opts ); + +Takes an initialized Text::Tradition object and a set of options; creates +the appropriate nodes and edges on the graph. The options hash should +include either a 'file' argument or a 'string' argument, depending on the +source of the XML to be parsed. + +=begin testing + +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +my $tradition = 't/data/florilegium_graphml.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'Self', + 'file' => $tradition, + ); + +is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" ); +if( $t ) { + is( scalar $t->collation->readings, 319, "Collation has all readings" ); + is( scalar $t->collation->paths, 2854, "Collation has all paths" ); + is( scalar $t->witnesses, 13, "Collation has all witnesses" ); +} + +=end testing =cut -# TODO share these with Collation.pm somehow my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY, $SOURCE_KEY, $TARGET_KEY, $WITNESS_KEY, $EXTRA_KEY, $RELATIONSHIP_KEY ) = qw/ name reading identical rank class @@ -35,16 +124,13 @@ my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY, sub parse { my( $tradition, $opts ) = @_; - my $graph_data = Text::Tradition::Parser::GraphML::parse( $opts ); + my $graph_data = graphml_parse( $opts ); - # TODO this is begging for stream parsing instead of multiple loops. - my $collation = $tradition->collation; my %witnesses; # Set up the graph-global attributes. They will appear in the # hash under their accessor names. - # TODO Consider simplifying this for nodes & edges as well. print STDERR "Setting graph globals\n"; foreach my $gkey ( keys %{$graph_data->{'attr'}} ) { my $val = $graph_data->{'attr'}->{$gkey}; @@ -52,7 +138,6 @@ sub parse { } # Add the nodes to the graph. - # TODO Are we adding extra start/end nodes? my $extra_data = {}; # Keep track of data that needs to be processed # after the nodes & edges are created. @@ -68,7 +153,6 @@ sub parse { # Create the node. Current valid classes are common and meta. # Everything else is a normal reading. - ## TODO RATIONALIZE THESE CLASSES my $gnode = $collation->add_reading( $nodeid ); $gnode->text( $reading ); $gnode->make_common if $class eq 'common'; @@ -86,6 +170,7 @@ sub parse { # Now add the edges. print STDERR "Adding graph edges\n"; + my $has_ante_corr = {}; foreach my $e ( @{$graph_data->{'edges'}} ) { my $from = $e->{$SOURCE_KEY}; my $to = $e->{$TARGET_KEY}; @@ -104,7 +189,7 @@ sub parse { $tradition->add_witness( sigil => $wit ); $witnesses{$wit} = 1; } - $witnesses{$wit} = 2 if $extra; + $has_ante_corr->{$wit} = 1 if $extra; } elsif( $class eq 'relationship' ) { # We need the metadata about the relationship. my $opts = { 'type' => $e->{$RELATIONSHIP_KEY} }; @@ -135,20 +220,21 @@ sub parse { } # Set the $witness->path arrays for each wit. - print STDERR "Walking paths for witnesses\n"; - foreach my $wit ( $tradition->witnesses ) { - my @path = $collation->reading_sequence( $collation->start, $collation->end, - $wit->sigil ); - $wit->path( \@path ); - if( $witnesses{$wit->sigil} == 2 ) { - # Get the uncorrected path too - my @uc = $collation->reading_sequence( $collation->start, $collation->end, - $wit->sigil . $collation->ac_label, $wit->sigil ); - $wit->uncorrected_path( \@uc ); - } - } + populate_witness_path( $tradition, $has_ante_corr ); } +1; + +=head1 BUGS / TODO + +=over + +=item * Make this into a stream parser with GraphML + +=item * Simply field -> attribute correspondence for nodes and edges + +=item * Share key name constants with Collation.pm + =back =head1 LICENSE @@ -159,8 +245,4 @@ the same terms as Perl itself. =head1 AUTHOR -Tara L Andrews, aurum@cpan.org - -=cut - -1; +Tara L Andrews Eaurum@cpan.orgE diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm index c7e607d..a86adf7 100644 --- a/lib/Text/Tradition/Parser/TEI.pm +++ b/lib/Text/Tradition/Parser/TEI.pm @@ -39,9 +39,7 @@ appears in the text body (including elements therein.) =head1 METHODS -=over - -=item B( $tradition, $option_hash ) +=head2 B( $tradition, $option_hash ) Takes an initialized tradition and a set of options; creates the appropriate nodes and edges on the graph, as well as the appropriate @@ -245,12 +243,8 @@ sub _return_rdg { return $real; } -=begin testing - ## TODO test specific sorts of nodes of the parallel-seg XML. -=end testing - ## Recursive helper function to help us navigate through nested XML, ## picking out the text. $tradition is the tradition, needed for ## making readings; $xn is the XML node currently being looked at, @@ -492,6 +486,10 @@ sub _get_sigla { =item * More unit testing +=item * Handle special designations apart from a.c. + +=item * Mark common nodes within collated variants + =back =head1 LICENSE diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm index 9daed14..64b1575 100644 --- a/lib/Text/Tradition/Parser/Tabular.pm +++ b/lib/Text/Tradition/Parser/Tabular.pm @@ -32,9 +32,7 @@ Parser module for Text::Tradition to read an alignment table format, such as CSV =head1 METHODS -=over - -=item B( $tradition, $option_hash ) +=head2 B( $tradition, $option_hash ) Takes an initialized tradition and a set of options; creates the appropriate nodes and edges on the graph, as well as the appropriate diff --git a/lib/Text/Tradition/Parser/Util.pm b/lib/Text/Tradition/Parser/Util.pm index a1fe4b2..6066d99 100644 --- a/lib/Text/Tradition/Parser/Util.pm +++ b/lib/Text/Tradition/Parser/Util.pm @@ -27,8 +27,6 @@ collation is a Text::Tradition::Collation object; the elements of @readings are Text::Tradition::Collation::Reading objects that appear on the collation graph. -TODO: Handle collapsed and non-collapsed transpositions. - =cut sub collate_variants { diff --git a/t/text_tradition_parser_collatex.t b/t/text_tradition_parser_collatex.t new file mode 100644 index 0000000..6bba0d1 --- /dev/null +++ b/t/text_tradition_parser_collatex.t @@ -0,0 +1,39 @@ +#!/usr/bin/perl -w + +use strict; +use Test::More 'no_plan'; +$| = 1; + + + +# =begin testing +{ +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +my $cxfile = 't/data/Collatex-16.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'CollateX', + 'file' => $cxfile, + ); + +is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" ); +if( $t ) { + is( scalar $t->collation->readings, 26, "Collation has all readings" ); + is( scalar $t->collation->paths, 49, "Collation has all paths" ); + is( scalar $t->witnesses, 3, "Collation has all witnesses" ); + + # Check an 'identical' node + my $transposed = $t->collation->reading( 'n15' ); + ok( $transposed->has_primary, "Reading links to transposed primary" ); + is( $transposed->primary->name, 'n17', "Correct transposition link" ); +} +} + + + + +1; diff --git a/t/text_tradition_parser_self.t b/t/text_tradition_parser_self.t new file mode 100644 index 0000000..c8a9bc0 --- /dev/null +++ b/t/text_tradition_parser_self.t @@ -0,0 +1,34 @@ +#!/usr/bin/perl -w + +use strict; +use Test::More 'no_plan'; +$| = 1; + + + +# =begin testing +{ +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +my $tradition = 't/data/florilegium_graphml.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'Self', + 'file' => $tradition, + ); + +is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" ); +if( $t ) { + is( scalar $t->collation->readings, 319, "Collation has all readings" ); + is( scalar $t->collation->paths, 2854, "Collation has all paths" ); + is( scalar $t->witnesses, 13, "Collation has all witnesses" ); +} +} + + + + +1; diff --git a/t/text_tradition_parser_tei.t b/t/text_tradition_parser_tei.t index 09e9f0b..38c5e0e 100644 --- a/t/text_tradition_parser_tei.t +++ b/t/text_tradition_parser_tei.t @@ -31,13 +31,6 @@ if( $t ) { # =begin testing { -## TODO test specific sorts of nodes of the parallel-seg XML. -} - - - -# =begin testing -{ use XML::LibXML; use XML::LibXML::XPathContext; use Text::Tradition::Parser::TEI;