From: Tara L Andrews Date: Fri, 2 Sep 2011 07:32:11 +0000 (+0200) Subject: add support for alignment table input X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=d9e873d0f90f1b7aec072c16a0eed37878f7f47f;p=scpubgit%2Fstemmatology.git add support for alignment table input --- diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm index 601a5d6..5c3f858 100644 --- a/lib/Text/Tradition.pm +++ b/lib/Text/Tradition.pm @@ -67,7 +67,7 @@ sub BUILD { $self->_save_collation( $collation ); # Call the appropriate parser on the given data - my @formats = grep { /^(Self|CollateX|CSV|CTE|KUL|TEI)$/ } keys( %$init_args ); + my @formats = grep { /^(Self|CollateX|CSV|CTE|KUL|TEI|Tabular)$/ } keys( %$init_args ); my $format = shift( @formats ); unless( $format ) { warn "No data given to create a collation; will initialize an empty one"; diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index e78fe3c..7d48cfc 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -6,7 +6,6 @@ use Graph::Easy; use IPC::Run qw( run binary ); use Text::CSV_XS; use Text::Tradition::Collation::Path; -use Text::Tradition::Collation::Position; use Text::Tradition::Collation::Reading; use Text::Tradition::Collation::Relationship; use Text::Tradition::Collation::Segment; @@ -296,6 +295,8 @@ sub as_svg { my @cmd = qw/dot -Tsvg/; my( $svg, $err ); my $dotfile = File::Temp->new(); + ## TODO REMOVE + $dotfile->unlink_on_destroy(0); binmode $dotfile, ':utf8'; print $dotfile $self->as_dot(); push( @cmd, $dotfile->filename ); @@ -384,7 +385,7 @@ sub as_graphml { # Add the data keys for nodes my %node_data_keys; my $ndi = 0; - foreach my $datum ( qw/ name reading identical position class / ) { + foreach my $datum ( qw/ name reading identical rank class / ) { $node_data_keys{$datum} = 'dn'.$ndi++; my $key = $root->addNewChild( $graphml_ns, 'key' ); $key->setAttribute( 'attr.name', $datum ); @@ -425,8 +426,8 @@ sub as_graphml { $node_el->setAttribute( 'id', $node_xmlid ); _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name ); _add_graphml_data( $node_el, $node_data_keys{'reading'}, $n->label ); - _add_graphml_data( $node_el, $node_data_keys{'position'}, $n->position->reference ) - if $n->has_position; + _add_graphml_data( $node_el, $node_data_keys{'rank'}, $n->rank ) + if $n->has_rank; _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class ); _add_graphml_data( $node_el, $node_data_keys{'identical'}, $n->primary->name ) if $n->has_primary; @@ -649,8 +650,8 @@ sub start { $self->graph->rename_node( $new_start, '#START#' ); } # Make sure the start node has a start position. - unless( $self->reading( '#START#' )->has_position ) { - $self->reading( '#START#' )->position( '0,0' ); + unless( $self->reading( '#START#' )->has_rank ) { + $self->reading( '#START#' )->rank( '0' ); } return $self->reading('#START#'); } @@ -875,12 +876,14 @@ sub make_witness_path { foreach my $idx ( 0 .. $#chain-1 ) { $self->add_path( $chain[$idx], $chain[$idx+1], $sig ); } - @chain = @{$wit->uncorrected_path}; - foreach my $idx( 0 .. $#chain-1 ) { - my $source = $chain[$idx]; - my $target = $chain[$idx+1]; - $self->add_path( $source, $target, $sig.$self->ac_label ) - unless $self->has_path( $source, $target, $sig ); + if( $wit->has_ante_corr ) { + @chain = @{$wit->uncorrected_path}; + foreach my $idx( 0 .. $#chain-1 ) { + my $source = $chain[$idx]; + my $target = $chain[$idx+1]; + $self->add_path( $source, $target, $sig.$self->ac_label ) + unless $self->has_path( $source, $target, $sig ); + } } } @@ -976,9 +979,6 @@ sub possible_positions { # TODO think about indexing this. sub readings_at_position { my( $self, $position, $strict ) = @_; - unless( ref( $position ) eq 'Text::Tradition::Collation::Position' ) { - $position = Text::Tradition::Collation::Position->new( $position ); - } my @answer; foreach my $r ( $self->readings ) { push( @answer, $r ) if $r->is_at_position( $position, $strict ); @@ -1003,7 +1003,7 @@ sub init_lemmata { sub common_readings { my $self = shift; my @common = grep { $_->is_common } $self->readings(); - return sort { $a->position->cmp_with( $b->position ) } @common; + return sort { $a->rank <=> $b->rank } @common; } =item B diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm index 5526d3a..2ea2cad 100644 --- a/lib/Text/Tradition/Parser/CollateX.pm +++ b/lib/Text/Tradition/Parser/CollateX.pm @@ -119,11 +119,13 @@ sub parse { $collation->end( $gnode ); } } + + # TODO Need to populate $wit->path / uncorrected_path # Now we have added the witnesses and their paths, so we can # calculate their explicit positions. # TODO CollateX does this, and we should just have it exported there. - $collation->calculate_positions(); + $collation->calculate_ranks(); } =back diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm index 8c5c391..5311660 100644 --- a/lib/Text/Tradition/Parser/Self.pm +++ b/lib/Text/Tradition/Parser/Self.pm @@ -27,8 +27,8 @@ graph. =cut -my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $POSITION_KEY, $CLASS_KEY ) - = qw/ name reading identical position class /; +my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY ) + = qw/ name reading identical rank class /; sub parse { my( $tradition, $graphml_str ) = @_; @@ -114,21 +114,15 @@ sub parse { # We evidently have a linear graph. $linear = 1; $this_reading->set_identical( $other_reading ); - } elsif ( $edkey eq $POSITION_KEY ) { - $this_reading->position( $extra_data->{$nkey}->{$edkey} ); + } elsif ( $edkey eq $RANK_KEY ) { + $this_reading->rank( $extra_data->{$nkey}->{$edkey} ); } else { warn "Unfamiliar reading node data $edkey for $nkey"; } } } $collation->linear( $linear ); - - # We know what the beginning and ending nodes are, no need to - # search or reset. - my $end_node = $collation->reading( '#END#' ); - # Walk the paths and make reading sequences for our witnesses. - # No need to calculate positions as we have them already. - $collation->walk_witness_paths( $end_node ); + # TODO We probably need to set the $witness->path arrays for each wit. } =back diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm new file mode 100644 index 0000000..10d0730 --- /dev/null +++ b/lib/Text/Tradition/Parser/Tabular.pm @@ -0,0 +1,104 @@ +package Text::Tradition::Parser::Tabular; + +use strict; +use warnings; +use Text::CSV_XS; + +=head1 NAME + +Text::Tradition::Parser::Tabular + +=head1 DESCRIPTION + +Parser module for Text::Tradition to read an alignment table format, such as CSV. + +=head1 METHODS + +=over + +=item B + +parse( $graph, $graphml_string ); + +Takes an initialized Text::Tradition::Graph object and a string +containing the GraphML; creates the appropriate nodes and edges on the +graph. + +=cut + +sub parse { + my( $tradition, $tab_str ) = @_; + # TODO Allow setting of sep_char + my $c = $tradition->collation; # shorthand + my $csv = Text::CSV_XS->new( { binary => 1 } ); # binary for UTF-8 + my @lines = split( "\n", $tab_str ); + # Conveniently, we are basically receiving exactly the sort of alignment table + # we might want to produce later. May as well save it. + my $alignment_table; + foreach my $l ( @lines ) { + my $status = $csv->parse( $l ); + if( $status ) { + push( @$alignment_table, [ $csv->fields ] ); + } else { + warn "Could not parse line $l: " . $csv->error_input; + } + } + + # Set up the witnesses we find in the first line + my @witnesses; + foreach my $sigil ( @{$alignment_table->[0]} ) { + my $wit = $tradition->add_witness( 'sigil' => $sigil ); + $wit->path( [ $c->start ] ); + push( @witnesses, $wit ); + } + + # Now for the next rows, make nodes as necessary, assign their ranks, and + # add them to the witness paths. + $DB::single = 1; + foreach my $idx ( 1 .. $#{$alignment_table} ) { + my $row = $alignment_table->[$idx]; + my $nodes = make_nodes( $c, $row, $idx ); + foreach my $w ( 0 .. $#{$row} ) { + # push the appropriate node onto the appropriate witness path + my $word = $row->[$w]; + if( $word ) { + my $reading = $nodes->{$word}; + my $wit = $witnesses[$w]; + push( @{$wit->path}, $reading ); + } # else skip it for empty readings. + } + } + + # Push the end node onto all paths. + $c->end->rank( scalar @$alignment_table ); + foreach my $wit ( @witnesses ) { + push( @{$wit->path}, $c->end ); + } + + # Join up the paths. + $c->make_witness_paths; + + # Save the alignment table that was so handily provided to us. + # TODO if we support other delimiters, we will have to re-export this + # rather than saving the original string. + $c->_save_csv( $tab_str ); +} + +sub make_nodes { + my( $collation, $row, $index ) = @_; + my %unique; + foreach my $w ( @$row ) { + $unique{$w} = 1 if $w; + } + my $ctr = 1; + foreach my $w ( keys %unique ) { + my $r = $collation->add_reading( "$index,$ctr" ); + $ctr++; + $r->rank( $index ); + $r->text( $w ); + $unique{$w} = $r; + } + return \%unique; +} + +1; \ No newline at end of file diff --git a/make_tradition.pl b/make_tradition.pl index 14acdef..6828e71 100644 --- a/make_tradition.pl +++ b/make_tradition.pl @@ -26,7 +26,7 @@ if( $help ) { help(); } -unless( $informat =~ /^(CSV|CTE|KUL|Self|TEI|CollateX)$/i ) { +unless( $informat =~ /^(CSV|CTE|KUL|Self|TEI|CollateX|tab(ular)?)$/i ) { help( "Input format must be one of CollateX, CSV, CTE, Self, TEI" ); } $informat = 'CollateX' if $informat =~ /^c(ollate)?x$/i; @@ -34,6 +34,7 @@ $informat = 'KUL' if $informat =~ /^kul$/i; $informat = 'CTE' if $informat =~ /^cte$/i; $informat = 'Self' if $informat =~ /^self$/i; $informat = 'TEI' if $informat =~ /^tei$/i; +$informat = 'Tabular' if $informat =~ /^tab$/i; unless( $outformat =~ /^(graphml|svg|dot|stemma|csv)$/ ) { help( "Output format must be one of graphml, svg, csv, stemma, or dot" ); @@ -49,6 +50,7 @@ my $input = $ARGV[0]; unless( $informat eq 'KUL' || $informat eq 'CSV' ) { my @lines; open( INFILE, "$input" ) or die "Could not read $input"; + binmode INFILE, ':utf8'; @lines = ; close INFILE; $input = join( '', @lines );