From: Tara L Andrews Date: Fri, 6 May 2011 13:23:10 +0000 (+0200) Subject: made CSV parser standalone, lots of changes to Base, etc. X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=52ce987f46da555e925cf6d1e159fa9ec413f5b3;p=scpubgit%2Fstemmatology.git made CSV parser standalone, lots of changes to Base, etc. --- diff --git a/lib/Text/Tradition/Graph.pm b/lib/Text/Tradition/Graph.pm index be16099..ee90761 100644 --- a/lib/Text/Tradition/Graph.pm +++ b/lib/Text/Tradition/Graph.pm @@ -81,8 +81,7 @@ sub new { my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %opts ); my $format = shift( @formats ); unless( $format ) { - warn "No data given to create a graph: need GraphML, CSV, or TEI"; - return; + warn "No data given to create a graph; will initialize an empty one"; } if( $format =~ /^(CSV|CTE)$/ && !exists $opts{'base'} ) { warn "Cannot make a graph from $format without a base text"; @@ -99,14 +98,20 @@ sub new { bless( $self, $class ); # Now do the parsing. - my $mod = "Text::Tradition::Parser::$format"; - load( $mod ); - my @args = ( $opts{ $format } ); - if( $format =~ /^(CSV|CTE)$/ ) { - push( @args, $opts{'base'} ); + if( $format ) { + my @args; + if( $format =~ /^(CSV|CTE)$/ ) { + @args = ( 'base' => $opts{'base'}, + 'data' => $opts{$format}, + 'format' => $format ); + $format = 'BaseText'; + } else { + @args = ( $opts{ $format } ); + } + my $mod = "Text::Tradition::Parser::$format"; + load( $mod ); + $mod->can('parse')->( $self, @args ); } - $mod->can('parse')->( $self, @args ); - return $self; } @@ -341,14 +346,13 @@ with $last, along the given witness path. =cut sub node_sequence { - my( $self, $start, $end, $label ) = @_; - # TODO make label able to follow a single MS + my( $self, $start, $end, $witness, $backup ) = @_; unless( ref( $start ) eq 'Graph::Easy::Node' && ref( $end ) eq 'Graph::Easy::Node' ) { warn "Called node_sequence without two nodes!"; return (); } - $label = 'base text' unless $label; + $witness = 'base text' unless $witness; my @nodes = ( $start ); my %seen; my $n = $start; @@ -360,11 +364,19 @@ sub node_sequence { $seen{$n->name()} = 1; my @edges = $n->outgoing(); - my @relevant_edges = grep { $_->label =~ /^$label$/ } @edges; - warn "Did not find an edge $label from node " . $n->label + my @relevant_edges = grep { my @w = split( /, /, $_->label ); + grep { /^\Q$witness\E$/ } @w } @edges; + unless( @relevant_edges ) { + @relevant_edges = grep { my @w = split( /, /, $_->label ); + grep { /^\Q$backup\E$/ } @w } @edges + if $backup; + } + unless( @relevant_edges ) { + @relevant_edges = grep { $_->label() eq 'base text' } @edges; + } + + warn "Did not find an edge for $witness from node " . $n->label unless scalar @relevant_edges; - warn "Found more than one edge $label from node " . $n->label - unless scalar @relevant_edges == 1; my $next = $relevant_edges[0]->to(); push( @nodes, $next ); $n = $next; @@ -372,7 +384,7 @@ sub node_sequence { # Check that the last node is our end node. my $last = $nodes[$#nodes]; warn "Last node found from " . $start->label() . - " via path $label is not the end!" + " for witness $witness is not the end!" unless $last eq $end; return @nodes; @@ -514,7 +526,7 @@ sub toggle_node { # In case this is being called for the first time. $self->init_lemmatizer(); - if( $self->is_common( $node ) ) { + if( !$node || $self->is_common( $node ) ) { # Do nothing, it's a common node. return; } @@ -635,8 +647,7 @@ sub active_nodes { return @answer; } -# A couple of helpers. TODO These should be gathered in the same place -# eventually +# A couple of helpers. sub is_common { my( $self, $node ) = @_; diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index 59f5a59..07e9d57 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -2,9 +2,7 @@ package Text::Tradition::Parser::BaseText; use strict; use warnings; -use Exporter 'import'; -use vars qw( @EXPORT_OK ); -@EXPORT_OK = qw( merge_base ); +use Module::Load; =head1 NAME @@ -30,6 +28,26 @@ will join those listed variants onto the reference text. =over +=item B + +parse( $graph, %opts ); + +Takes an initialized graph and a set of options, which must include: +- 'base' - the base text referenced by the variants +- 'format' - the format of the variant list +- 'data' - the variants, in the given format. + +=cut + +sub parse { + my( $graph, %opts ) = @_; + + my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'}; + load( $format_mod ); + my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} ); + merge_base( $graph, $opts{'base'}, @apparatus_entries ); +} + =item B merge_base( $graph, 'reference.txt', @apparatus_entries ) @@ -61,6 +79,7 @@ sub merge_base { my( $graph, $base_file, @app_entries ) = @_; my @base_line_starts = read_base( $base_file, $graph ); + my %all_witnesses; foreach my $app ( @app_entries ) { my( $line, $num ) = split( /\./, $app->{_id} ); # DEBUG with a short graph @@ -158,6 +177,7 @@ sub merge_base { # Determine the label name for the edges here. my $edge_name = join(', ', @mss ); + @all_witnesses{ @mss } = ( 1 ) x scalar( @mss ); # Make the variant into a set of nodes. my $ctr = 0; @@ -189,19 +209,39 @@ sub merge_base { ## read. So I collapse nodes that have only one edge in and one edge ## out, and I do this by looking at the edges. - foreach my $edge ( $graph->edges() ) { - my @out_edges = $edge->from()->outgoing(); - my @in_edges = $edge->to()->incoming(); +# foreach my $edge ( $graph->edges() ) { +# my @out_edges = $edge->from()->outgoing(); +# my @in_edges = $edge->to()->incoming(); - next unless scalar( @out_edges ) == 1; - next unless scalar( @in_edges ) == 1; - next unless $out_edges[0] eq $in_edges[0]; - # In theory if we've got this far, we're safe, but just to - # double-check... - next unless $out_edges[0] eq $edge; +# next if $edge->from() eq $graph->start(); +# next if $edge->to()->name() eq '#END#'; +# next unless scalar( @out_edges ) == 1; +# next unless scalar( @in_edges ) == 1; +# next unless $out_edges[0] eq $in_edges[0]; +# # In theory if we've got this far, we're safe, but just to +# # double-check... +# next unless $out_edges[0] eq $edge; - $graph->merge_nodes( $edge->from(), $edge->to(), ' ' ); +# $graph->merge_nodes( $edge->from(), $edge->to(), ' ' ); +# } + + # Now walk the path for each witness, so that we can do the + # position calculations. + my $paths = {}; + foreach my $w ( keys %all_witnesses ) { + my $back = undef; + if( $w =~ /^(.*)\s*\(p\.\s*c\.\)/ ) { + $back = $1; + } + my @wit_nodes = $graph->node_sequence( $graph->start, + $graph->node( '#END#' ), + $w, $back ); + my @wn_names = map { $_->name() } @wit_nodes; + $paths->{$w} = \@wn_names; } + $DB::single = 1; + my @common_nodes = grep { $graph->is_common( $_ ) } $graph->nodes(); + $graph->make_positions( \@common_nodes, $paths ); } =item B @@ -276,7 +316,7 @@ sub collate_variants { my( $graph, @readings ) = @_; my $lemma_start = shift @readings; my $lemma_end = shift @readings; - my $detranspose = 1; + my $detranspose = 0; # Start the list of distinct nodes with those nodes in the lemma. my @distinct_nodes; @@ -333,6 +373,7 @@ sub collate_variants { $graph->prior_word( $l, $edgelabel ), $l ); remove_duplicate_edges( $graph, $l, $graph->next_word( $l, $edgelabel ) ); + last if $matched; } } push( @remaining_nodes, [ $w, $var_label ] ) unless $matched; diff --git a/lib/Text/Tradition/Parser/CSV.pm b/lib/Text/Tradition/Parser/CSV.pm index a7e3ea4..a4836aa 100644 --- a/lib/Text/Tradition/Parser/CSV.pm +++ b/lib/Text/Tradition/Parser/CSV.pm @@ -3,7 +3,6 @@ package Text::Tradition::Parser::CSV; use strict; use warnings; use Text::CSV::Simple; -use Text::Tradition::Parser::BaseText qw( merge_base ); =head1 NAME @@ -19,29 +18,16 @@ breaks. =over -=item B +=item B -parse( $graph, 'variants.csv', 'reference.txt' ); +my @apparatus = read( $csv_file ); -Takes an initialized Text::Tradition::Graph object and the relevant -data files; puts the text and its variants onto the graph. +Takes a CSV file; returns a data structure of apparatus entries to be +merged with a base text. =cut -sub parse { - my( $graph, $csv_file, $base_text ) = @_; - - # Parse the CSV file into a list of apparatus entries. - my @app_list = _read_csv( $csv_file ); - # Now put the base text onto the graph, and merge in the - # apparatus entries. - merge_base( $graph, $base_text, @app_list ); -} - -# Takes a CSV file; returns a data structure of apparatus entries to -# be merged with a base text. - -sub _read_csv { +sub read { my( $csv_file ) = @_; my $parser = Text::CSV::Simple->new(); my @fields = qw/ reference text variant type context non_corr non_indep