From: Tara L Andrews Date: Thu, 21 Apr 2011 15:15:52 +0000 (+0200) Subject: Add some documentation X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=2ceca8c3211427df102bec429c6b6397dda8f9b4;p=scpubgit%2Fstemmatology.git Add some documentation --- diff --git a/lib/Text/Tradition/Graph.pm b/lib/Text/Tradition/Graph.pm index 125eded..f49db42 100644 --- a/lib/Text/Tradition/Graph.pm +++ b/lib/Text/Tradition/Graph.pm @@ -13,19 +13,19 @@ Text::Tradition::Graph =head1 SYNOPSIS -use Text::Tradition::Graph; + use Text::Tradition::Graph; -my $text = Text::Tradition::Graph->new( 'GraphML' => '/my/graphml/file.xml' ); -my $text = Text::Tradition::Graph->new( 'TEI' => '/my/tei/file.xml' ); -my $text = Text::Tradition::Graph->new( 'CSV' => '/my/csv/file.csv', - 'base' => '/my/basefile.txt' ); -my $text = Text::Tradition::Graph->new( 'CTE' => '/my/cte/file.txt', - 'base' => '/my/basefile.txt' ); + my $text = Text::Tradition::Graph->new( 'GraphML' => '/my/graphml/file.xml' ); + my $text = Text::Tradition::Graph->new( 'TEI' => '/my/tei/file.xml' ); + my $text = Text::Tradition::Graph->new( 'CSV' => '/my/csv/file.csv', + 'base' => '/my/basefile.txt' ); + my $text = Text::Tradition::Graph->new( 'CTE' => '/my/cte/file.txt', + 'base' => '/my/basefile.txt' ); -my $svg_string = $text->as_svg(); + my $svg_string = $text->as_svg(); -my $lemma_nodes = $text->active_nodes(); -$text->toggle_node( 'some_word' ); + my $lemma_nodes = $text->active_nodes(); + $text->toggle_node( 'some_word' ); =head1 DESCRIPTION @@ -58,10 +58,13 @@ others. Constructor. Takes a source collation file from which to construct the initial graph. This file can be TEI (parallel segmentation) XML, -CSV in a format yet to be documented, GraphML as documented (someday) -by CollateX, or a Classical Text Editor apparatus. For CSV and -Classical Text Editor files, the user must also supply a base text to -which the line numbering in the collation file refers. +CSV in a format yet to be documented, GraphML as documented by the +CollateX tool (L), or a Classical +Text Editor apparatus. For CSV and Classical Text Editor files, the +user must also supply a base text to which the line numbering in the +collation file refers. + +20/04/2011 Currently only CSV and GraphML are really supported. =cut @@ -107,20 +110,55 @@ sub new { return $self; } +=item B + +$graph->make_positions( $common_nodes, $paths ) + +Create an associated Graph::Positions object that records the position +of each node in the graph. This method call is probably in the wrong +place and will move. + +=cut + +sub make_positions { + my( $self, $common_nodes, $paths ) = @_; + my $positions = Text::Tradition::Graph::Position->new( $common_nodes, $paths ); + $self->{'positions'} = $positions; +} + +=back + +=head2 Graph::Easy object accessor methods + +See the Graph::Easy documentation for descriptions of these functions. + +=over + +=item B + +=cut -### Graph::Easy object accessor methods sub node { my $self = shift; return $self->{'graph'}->node( @_ ); } +=item B + +=cut + sub edge { my $self = shift; return $self->{'graph'}->edge( @_ ); } +=item B + +=cut + # Not only adds the node, but also initializes internal data # about the node. + sub add_node { my $self = shift; my $node = $self->{'graph'}->add_node( @_ ); @@ -128,11 +166,19 @@ sub add_node { return $node; } +=item B + +=cut + sub add_edge { my $self = shift; return $self->{'graph'}->add_edge( @_ ); } +=item B + +=cut + sub del_node { my $self = shift; my $node = $_[0]; @@ -158,21 +204,37 @@ sub del_node { return $self->{'graph'}->del_node( @_ ); } +=item B + +=cut + sub del_edge { my $self = shift; return $self->{'graph'}->del_edge( @_ ); } +=item B + +=cut + sub nodes { my $self = shift; return $self->{'graph'}->nodes( @_ ); } +=item B + +=cut + sub edges { my $self = shift; return $self->{'graph'}->edges( @_ ); } +=item B + +=cut + sub merge_nodes { my $self = shift; return $self->{'graph'}->merge_nodes( @_ ); @@ -180,6 +242,20 @@ sub merge_nodes { ### Helper methods for navigating the tree +=back + +=head2 Graph navigation methods + +=over + +=item B + +my $node = $graph->start(); + +Returns the beginning node of the graph. + +=cut + sub start { # Return the beginning node of the graph. my $self = shift; @@ -193,30 +269,14 @@ sub start { return $self->{'graph'}->node('#START#'); } -# Record that nodes A and B are really the same (transposed) node. -# We do this by maintaining some pools of transposed nodes, and -# we have a lookup hash so that each member of that -sub set_identical_node { - my( $self, $node, $same_node ) = @_; - my $pool = $self->{'identical_nodes'}->{ $node }; - my $same_pool = $self->{'identical_nodes'}->{ $same_node }; - my %poolhash; - foreach ( @$pool ) { - $poolhash{$_} = 1; - } - foreach( @$same_pool ) { - push( @$pool, $_ ) unless $poolhash{$_}; - } +=item B - $self->{'identical_nodes'}->{ $same_node } = $pool; -} +my $next_node = $graph->next_word( $node, $path ); -sub identical_nodes { - my( $self, $node ) = @_; - my @others = grep { $_ !~ /^$node$/ } - @{$self->{'identical_nodes'}->{ $node }}; - return @others; -} +Returns the node that follows the given node along the given witness +path. TODO These are badly named. + +=cut sub next_word { # Return the successor via the corresponding edge. @@ -234,6 +294,15 @@ sub next_word { return undef; } +=item B + +my $prior_node = $graph->prior_word( $node, $path ); + +Returns the node that precedes the given node along the given witness +path. TODO These are badly named. + +=cut + sub prior_word { # Return the predecessor via the corresponding edge. my( $self, $node, $edge ) = @_; @@ -250,6 +319,15 @@ sub prior_word { return undef; } +=item B + +my @nodes = $graph->node_sequence( $first, $last, $path ); + +Returns the ordered list of nodes, starting with $first and ending +with $last, along the given witness path. + +=cut + sub node_sequence { my( $self, $start, $end, $label ) = @_; # TODO make label able to follow a single MS @@ -288,6 +366,15 @@ sub node_sequence { return @nodes; } +=item B + +my $text = $graph->string_lemma( $first, $last, $path ); + +Returns the whitespace-separated text, starting with $first and ending +with $last, represented in the graph along the given path. + +=cut + sub string_lemma { my( $self, $start, $end, $label ) = @_; @@ -296,8 +383,71 @@ sub string_lemma { return join( ' ', @words ); } -## Output. We use GraphViz for the layout because it handles large -## graphs better than Graph::Easy does natively. +=back + +=head2 Transposition handling methods + +These should really move to their own module. For use when the graph +has split transposed nodes in order to avoid edges that travel +backward. + +=over + +=item B + +$graph->set_identical_node( $node, $other_node ) + +Tell the graph that these two nodes contain the same (transposed) reading. + +=cut + +sub set_identical_node { + my( $self, $node, $same_node ) = @_; + my $pool = $self->{'identical_nodes'}->{ $node }; + my $same_pool = $self->{'identical_nodes'}->{ $same_node }; + my %poolhash; + foreach ( @$pool ) { + $poolhash{$_} = 1; + } + foreach( @$same_pool ) { + push( @$pool, $_ ) unless $poolhash{$_}; + } + + $self->{'identical_nodes'}->{ $same_node } = $pool; +} + +=item B + +my @nodes = $graph->identical_nodes( $node ) + +Get a list of nodes that contain the same (transposed) reading as the +given node. + +=cut + +sub identical_nodes { + my( $self, $node ) = @_; + my @others = grep { $_ !~ /^$node$/ } + @{$self->{'identical_nodes'}->{ $node }}; + return @others; +} + +=back + +=head2 Output method(s) + +=over + +=item B + +print $graph->as_svg( $recalculate ); + +Returns an SVG string that represents the graph. Uses GraphViz to do +this, because Graph::Easy doesn't cope well with long graphs. Unless +$recalculate is passed (and is a true value), the method will return a +cached copy of the SVG after the first call to the method. + +=cut sub as_svg { my( $self, $recalc ) = @_; @@ -314,7 +464,15 @@ sub as_svg { return $svg; } -## Methods for lemmatizing a text. +=back + +=head2 Lemmatization methods + +=over + +=item B + +=cut sub init_lemmatizer { my $self = shift; @@ -329,12 +487,6 @@ sub init_lemmatizer { } -sub make_positions { - my( $self, $common_nodes, $paths ) = @_; - my $positions = Text::Tradition::Graph::Position->new( $common_nodes, $paths ); - $self->{'positions'} = $positions; -} - # Takes a list of nodes that have just been turned off, and returns a # set of tuples of the form ['node', 'state'] that indicates what # changes need to be made to the graph. @@ -488,5 +640,19 @@ sub unique_list { return keys( %h ); } +=back + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews, aurum@cpan.org + +=cut + 1; diff --git a/lib/Text/Tradition/Graph/Position.pm b/lib/Text/Tradition/Graph/Position.pm index 3544bfb..d9f023d 100644 --- a/lib/Text/Tradition/Graph/Position.pm +++ b/lib/Text/Tradition/Graph/Position.pm @@ -10,7 +10,9 @@ Text::Tradition::Graph::Position =head1 SUMMARY An object to go with a text graph that keeps track of relative -positions of the nodes. +positions of the nodes on that graph. This is useful for keeping +track of which readings are variants of each other, which is expensive +to calculate every time from the graph itself. =head1 METHODS @@ -24,6 +26,8 @@ node based on this. =cut +# TODO Why not just hand over the graph and calculate the common nodes +# and witness paths here? sub new { my $proto = shift; my( $common_nodes, $witness_paths ) = @_; @@ -97,6 +101,14 @@ sub new { return $self; } +=item B + +my $pos = $positions->node_position( $node ); + +Returns the position identifier for a given node in the graph. + +=cut + sub node_position { my( $self, $node ) = @_; $node = _name( $node ); @@ -109,10 +121,18 @@ sub node_position { return $self->{'node_positions'}->{ $node }; } +=item B + +my @nodes = $positions->nodes_at_position( $pos ); + +Returns the names of all the nodes in the graph at a given position. + +=cut + sub nodes_at_position { my( $self, $pos ) = @_; - my $positions = $self->calc_positions(); + my $positions = $self->_calc_positions(); unless( exists $positions->{ $pos } ) { warn "No position $pos in the graph"; return; @@ -120,6 +140,15 @@ sub nodes_at_position { return @{ $positions->{ $pos }}; } +=item B + +my @nodes = $positions->colocated_nodes( $node ); + +Returns the names of all the nodes in the graph at the same position +as the node given, apart from that node itself. + +=cut + sub colocated_nodes { my( $self, $node ) = @_; $node = _name( $node ); @@ -130,26 +159,47 @@ sub colocated_nodes { return @cn; } -# Returns an ordered list of positions in this graph +=item B + +my @position_list = $positions->all() + +Returns an ordered list of positions in the graph. + +=cut + sub all { my( $self ) = @_; - my $pos = $self->calc_positions; + my $pos = $self->_calc_positions; return sort by_position keys( %$pos ); } -# Returns undef if no decision has been taken on this position, the -# node name if there is a lemma for it, and 0 if there is no lemma for -# it. -sub state { - my( $self, $pos ) = @_; - return $self->{'position_state'}->{ $pos }; +sub witness_path { + my( $self, $wit ) = @_; + return @{$self->{'witness_paths'}->{ $wit }}; } -sub set_state { - my( $self, $pos, $state ) = @_; - $self->{'position_state'}->{ $pos } = $state; -} +=back + +=head2 Lemmatization functions + +For some traditions, each position will have at least one node that is +the 'lemma text', that is, the text that an editor has chosen to stand +as authoritative for the tradition. The following methods keep +track of what lemma, if any, should stand at each position. + +=over +=item B + +$positions->init_lemmatizer( @nodelist ) + +Sets up the necessary logic for keeping track of lemmas. It should be +called once, with the initial list of lemmas. + +=cut + +# TODO We can initialize this without the argument, based on the passed +# list of common nodes. sub init_lemmatizer { my( $self, @nodes ) = @_; foreach my $n ( @nodes ) { @@ -157,30 +207,50 @@ sub init_lemmatizer { } } -sub witness_path { - my( $self, $wit ) = @_; - return @{$self->{'witness_paths'}->{ $wit }}; -} +=item B -# At some point I may find myself using scalar references for the node -# positions, in order to keep them easily in sync. Just in case, I will -# calculate this every time I need it. -sub calc_positions { - my $self = shift; - return _invert_hash( $self->{'node_positions'} ) +my $answer = $positions->state( $position_id ) + +For the given position ID, returns the node (if any) that stands at +the lemma. If no node should stand as lemma at this position, returns +0; if no decision has been made for this position, returns undef. + +=cut + +sub state { + my( $self, $pos ) = @_; + return $self->{'position_state'}->{ $pos }; } -# Helper for dealing with node refs -sub _name { - my( $node ) = @_; - # We work with node names in this library - if( ref( $node ) && ref( $node ) eq 'Graph::Easy::Node' ) { - $node = $node->name(); - } - return $node; +=item B + +$positions->set_state( $position_id, $state ) + +For the given position ID, sets the lemma (if any). State can be the +name of a node, 0 (for cases when no lemma should stand), or undef +(for cases when no decision has been made). + +=cut + +sub set_state { + my( $self, $pos, $state ) = @_; + $self->{'position_state'}->{ $pos } = $state; } -### Comparison functions +=back + +=head2 Comparison function + +=over + +=item B + +my @nodelist = sort $positions->by_position @nodelist; + +For use in the 'sort' function. Returns a comparison value based on +the position of the given nodes. + +=cut # Compares two nodes according to their positions in the witness # index hash. @@ -201,6 +271,27 @@ sub _cmp_position { return $pos_a[1] <=> $pos_b[1]; } + +#### HELPER FUNCTIONS #### + +# At some point I may find myself using scalar references for the node +# positions, in order to keep them easily in sync. Just in case, I will +# calculate this every time I need it. +sub _calc_positions { + my $self = shift; + return _invert_hash( $self->{'node_positions'} ) +} + +# Helper for dealing with node refs +sub _name { + my( $node ) = @_; + # We work with node names in this library + if( ref( $node ) && ref( $node ) eq 'Graph::Easy::Node' ) { + $node = $node->name(); + } + return $node; +} + # Useful helper. Will be especially useful if I find myself using # scalar references for the positions after all - it can dereference # them here. @@ -224,4 +315,18 @@ sub _invert_hash { return \%new_hash; } +=back + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews, aurum@cpan.org + +=cut + 1; diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index e1867e9..b9242c3 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -6,6 +6,57 @@ use Exporter 'import'; use vars qw( @EXPORT_OK ); @EXPORT_OK = qw( merge_base ); +=head1 NAME + +Text::Tradition::Parser::BaseText + +=head1 SYNOPSIS + +use Text::Tradition::Parser::BaseText qw( merge_base ); +merge_base( $graph, 'reference.txt', @apparatus_entries ) + +=head1 DESCRIPTION + +For an overview of the package, see the documentation for the +Text::Tradition::Graph module. + +This module is meant for use with certain of the other Parser classes +- whenever a list of variants is given with reference to a base text, +these must be joined into a single collation. The parser should +therefore make a list of variants and their locations, and BaseText +will join those listed variants onto the reference text. + +=head1 SUBROUTINES + +=over + +=item B + +merge_base( $graph, 'reference.txt', @apparatus_entries ) + +Takes three arguments: a newly-initialized Text::Tradition::Graph +object, a text file containing the reference text, and a list of +variants (apparatus entries). Adds the base text to the graph, and +joins the variants to that. + +The list of variants is an array of hash references; each hash takes +the form + { '_id' => line reference, + 'rdg_0' => lemma reading, + 'rdg_1' => first variant, + ... # and so on until all distinct readings are listed + 'WitnessA' => 'rdg_0', + 'WitnessB' => 'rdg_1', + ... # and so on until all witnesses are listed with their readings + } + +Any hash key that is not of the form /^rdg_\d+$/ and that does not +begin with an underscore is assumed to be a witness name. Any 'meta' +information to be passed must be passed in a key with a leading +underscore in its name. + +=cut + sub merge_base { my( $graph, $base_file, @app_entries ) = @_; my @base_line_starts = read_base( $base_file, $graph ); @@ -14,7 +65,8 @@ sub merge_base { my( $line, $num ) = split( /\./, $app->{_id} ); # DEBUG with a short graph # last if $line > 2; - my $scrutinize = "21.8"; + # DEBUG for problematic entries + # my $scrutinize = "21.8"; my $first_line_node = $base_line_starts[ $line ]; my $too_far = $base_line_starts[ $line+1 ]; @@ -146,11 +198,17 @@ sub merge_base { } } -# read_base: Takes a text file and a (presumed empty) graph object, -# adds the words as simple linear nodes to the graph, and returns a -# list of nodes that represent the beginning of lines. This graph is -# now the starting point for application of apparatus entries in -# merge_base, e.g. from a CSV file or a CTE file. +=item B + +my @line_beginnings = read_base( 'reference.txt', $graph ); + +Takes a text file and a (presumed empty) graph object, adds the words +as simple linear nodes to the graph, and returns a list of nodes that +represent the beginning of lines. This graph is now the starting point +for application of apparatus entries in merge_base, e.g. from a CSV +file or a Classical Text Editor file. + +=cut sub read_base { my( $base_file, $graph ) = @_; @@ -193,8 +251,18 @@ sub read_base { return( @$lineref_array ); } +=item B + +collate_variant( $graph, $lemma_start, $lemma_end, $var_start, $var_end ); + +Given a lemma and a variant as start- and endpoints on the graph, +walks through each to identify those nodes that are identical. The +graph is a Text::Tradition::Graph object; the other arguments are +Graph::Easy::Node objects that appear on the graph. -## Helper methods for merge_base +TODO: Handle collapsed and non-collapsed transpositions. + +=cut sub collate_variant { my( $graph, $lemma_start, $lemma_end, $var_start, $var_end ) = @_; @@ -246,6 +314,15 @@ sub collate_variant { } } +=item B + +remove_duplicate_edges( $graph, $from, $to ); + +Given two nodes, reduce the number of edges between those nodes to +one. If neither edge represents a base text, combine their labels. + +=cut + sub remove_duplicate_edges { my( $graph, $from, $to ) = @_; my @edges = $from->edges_to( $to ); @@ -269,7 +346,12 @@ sub remove_duplicate_edges { } } -# TODO need to make this configurable! +=item B + +Pretend you never saw this method. Really it needs to not be hardcoded. + +=cut + sub cmp_str { my( $node ) = @_; my $word = $node->label(); @@ -283,4 +365,18 @@ sub cmp_str { return $word; } +=back + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews, aurum@cpan.org + +=cut + 1; diff --git a/lib/Text/Tradition/Parser/CSV.pm b/lib/Text/Tradition/Parser/CSV.pm index d3d19ca..a7e3ea4 100644 --- a/lib/Text/Tradition/Parser/CSV.pm +++ b/lib/Text/Tradition/Parser/CSV.pm @@ -5,13 +5,34 @@ use warnings; use Text::CSV::Simple; use Text::Tradition::Parser::BaseText qw( merge_base ); -# Takes a CSV file and a base text; returns a GraphML object. +=head1 NAME + +Text::Tradition::Parser::CSV + +=head1 DESCRIPTION + +Parser module for Text::Tradition, given a list of variants as a CSV +file and a reference text as a plaintext file with appropriate line +breaks. + +=head1 METHODS + +=over + +=item B + +parse( $graph, 'variants.csv', 'reference.txt' ); + +Takes an initialized Text::Tradition::Graph object and the relevant +data files; puts the text and its variants onto the graph. + +=cut sub parse { my( $graph, $csv_file, $base_text ) = @_; # Parse the CSV file into a list of apparatus entries. - my @app_list = read_csv( $csv_file ); + my @app_list = _read_csv( $csv_file ); # Now put the base text onto the graph, and merge in the # apparatus entries. merge_base( $graph, $base_text, @app_list ); @@ -20,7 +41,7 @@ sub parse { # Takes a CSV file; returns a data structure of apparatus entries to # be merged with a base text. -sub read_csv { +sub _read_csv { my( $csv_file ) = @_; my $parser = Text::CSV::Simple->new(); my @fields = qw/ reference text variant type context non_corr non_indep @@ -93,5 +114,19 @@ sub read_csv { return @app_list; } +=back + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews, aurum@cpan.org + +=cut + 1; diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm index f36bea2..19b294c 100644 --- a/lib/Text/Tradition/Parser/GraphML.pm +++ b/lib/Text/Tradition/Parser/GraphML.pm @@ -5,8 +5,29 @@ use warnings; use XML::LibXML; use XML::LibXML::XPathContext; +=head1 NAME -# Takes a GraphML string; returns a Graph::Easy object. +Text::Tradition::Parser::GraphML + +=head1 DESCRIPTION + +Parser module for Text::Tradition, given a GraphML file that describes +a collation graph. For further information on the GraphML format for +text collation, see http://gregor.middell.net/collatex/ + +=head1 METHODS + +=over + +=item B + +parse( $graph, $graphml_string ); + +Takes an initialized Text::Tradition::Graph object and a string +containing the GraphML; creates the appropriate nodes and edges on the +graph. + +=cut sub parse { my( $graph, $graphml_str ) = @_; @@ -166,4 +187,18 @@ sub parse { } +=back + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews, aurum@cpan.org + +=cut + 1;