From: Tara L Andrews Date: Mon, 3 Oct 2011 09:18:13 +0000 (+0200) Subject: allow either file or string to be passed for parsing X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=dfc37e3886452920c1332a045f9102ce29457af1;p=scpubgit%2Fstemmatology.git allow either file or string to be passed for parsing --- diff --git a/Tradition.bbprojectd/project.bbprojectdata b/Tradition.bbprojectd/project.bbprojectdata index f3d8ee0..6dacb92 100644 --- a/Tradition.bbprojectd/project.bbprojectdata +++ b/Tradition.bbprojectd/project.bbprojectdata @@ -88,9 +88,9 @@ AAAAAAGkAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAA AAAAAADKPI0jSCsAAAAILogRbWFrZV90cmFkaXRpb24u cGwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAACJK7cp/KfxURVhUAAAAAP// + AAAAAAAAAAAAAAAAAAAAADA/68quyjNURVhUAAAAAP// //8AAAkgAAAAAAAAAAAAAAAAAAAADHN0ZW1tYXRvbG9n - eQAQAAgAAMo8cQMAAAARAAgAAMp/DdwAAAABABAACC6I + eQAQAAgAAMo8cQMAAAARAAgAAMqurhMAAAABABAACC6I AAckkwAFBAYAAL8xAAIAQk1hY2ludG9zaCBIRDpVc2Vy czoAdGxhOgBQcm9qZWN0czoAc3RlbW1hdG9sb2d5OgBt YWtlX3RyYWRpdGlvbi5wbAAOACQAEQBtAGEAawBlAF8A diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm index 3dc6338..5fa89b7 100644 --- a/lib/Text/Tradition.pm +++ b/lib/Text/Tradition.pm @@ -67,32 +67,35 @@ sub BUILD { $self->_save_collation( $collation ); # Call the appropriate parser on the given data - my @formats = grep { /^(Self|CollateX|CSV|CTE|KUL|TEI|Tabular)$/ } keys( %$init_args ); - my $format = shift( @formats ); + my @format_standalone = qw/ Self CollateX CSV CTE TEI Tabular /; + my @format_basetext = qw/ KUL /; + my $use_base; + my $format = $init_args->{'input'}; unless( $format ) { warn "No data given to create a collation; will initialize an empty one"; } - if( $format && $format =~ /^(KUL)$/ && - !exists $init_args->{'base'} ) { - warn "Cannot make a collation from $format without a base text"; + if( $format && !( grep { $_ eq $format } @format_standalone ) + && !( grep { $_ eq $format } @format_basetext ) ) { + warn "Unrecognized input format $format; not parsing"; return; } + if( $format && grep { $_ eq $format } @format_basetext ) { + $use_base = 1; + if( !exists $init_args->{'base'} ) { + warn "Cannot make a collation from $format without a base text"; + return; + } + } # Now do the parsing. - my @sigla; if( $format ) { - my @parseargs; - if( $format =~ /^(KUL)$/ ) { - $init_args->{'data'} = $init_args->{$format}; - $init_args->{'format'} = $format; - $format = 'BaseText'; - @parseargs = %$init_args; - } else { - @parseargs = ( $init_args->{ $format } ); + if( $use_base ) { + $format = 'BaseText'; # Use the BaseText module for parsing, + # but retain the original input arg. } my $mod = "Text::Tradition::Parser::$format"; load( $mod ); - $mod->can('parse')->( $self, @parseargs ); + $mod->can('parse')->( $self, $init_args ); } } } diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index ed2a447..e0e6ec2 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -41,12 +41,12 @@ Takes an initialized graph and a set of options, which must include: =cut sub parse { - my( $tradition, %opts ) = @_; + my( $tradition, $opts ) = @_; - my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'}; + my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'}; load( $format_mod ); - my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} ); - merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries ); + my @apparatus_entries = $format_mod->can('read')->( $opts->{'file'} ); + merge_base( $tradition->collation, $opts->{'base'}, @apparatus_entries ); } =item B diff --git a/lib/Text/Tradition/Parser/CTE.pm b/lib/Text/Tradition/Parser/CTE.pm index 954a84a..e16a9c7 100644 --- a/lib/Text/Tradition/Parser/CTE.pm +++ b/lib/Text/Tradition/Parser/CTE.pm @@ -33,12 +33,20 @@ my %sigil_for; # Save the XML IDs for witnesses. my %apps; # Save the apparatus XML for a given ID. sub parse { - my( $tradition, $xml_str ) = @_; + my( $tradition, $opts ) = @_; my $c = $tradition->collation; # Some shorthand # First, parse the XML. my $parser = XML::LibXML->new(); - my $doc = $parser->parse_string( $xml_str ); + my $doc; + if( exists $opts->{'string'} ) { + $doc = $parser->parse_string( $opts->{'string'} ); + } elsif ( exists $opts->{'file'} ) { + $doc = $parser->parse_file( $opts->{'file'} ); + } else { + warn "Could not find string or file option to parse"; + return; + } my $tei = $doc->documentElement(); my $xpc = XML::LibXML::XPathContext->new( $tei ); diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm index 2ea2cad..2ab0546 100644 --- a/lib/Text/Tradition/Parser/CollateX.pm +++ b/lib/Text/Tradition/Parser/CollateX.pm @@ -21,11 +21,11 @@ http://gregor.middell.net/collatex/ =item B -parse( $graph, $graphml_string ); +parse( $tradition, $init_options ); -Takes an initialized Text::Tradition::Graph object and a string -containing the GraphML; creates the appropriate nodes and edges on the -graph. +Takes an initialized Text::Tradition::Graph object and its initialization +options, including the data source; creates the appropriate nodes and edges +on the graph. =cut @@ -34,8 +34,8 @@ my $CONTENTKEY = 'token'; my $TRANSKEY = 'identical'; sub parse { - my( $tradition, $graphml_str ) = @_; - my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str ); + my( $tradition, $opts ) = @_; + my $graph_data = Text::Tradition::Parser::GraphML::parse( $opts ); my $collation = $tradition->collation; my %witnesses; # Keep track of the witnesses we encounter as we # run through the graph data. diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm index 27af6af..c29f78a 100644 --- a/lib/Text/Tradition/Parser/GraphML.pm +++ b/lib/Text/Tradition/Parser/GraphML.pm @@ -23,9 +23,10 @@ GraphML conventions of the source program. =item B -parse( $graphml_string ); +parse( $init_opts ); -Takes a string containing the GraphML; returns a list of nodes, edges, +Takes a set of Tradition initialization options, among which should be either +'file' or 'string'; parses that file or string and returns a list of nodes, edges, and their associated data. =cut @@ -36,13 +37,22 @@ use vars qw/ $xpc $graphattr $nodedata $witnesses /; # -> edgeid -> { source, target, wit1/val1, wit2/val2 ...} sub parse { - my( $graphml_str ) = @_; + my( $opts ) = @_; my $graph_hash = { 'nodes' => [], 'edges' => [] }; my $parser = XML::LibXML->new(); - my $doc = $parser->parse_string( $graphml_str ); + my $doc; + if( exists $opts->{'string'} ) { + $doc = $parser->parse_string( $opts->{'string'} ); + } elsif ( exists $opts->{'file'} ) { + $doc = $parser->parse_file( $opts->{'file'} ); + } else { + warn "Could not find string or file option to parse"; + return; + } + my $graphml = $doc->documentElement(); $xpc = XML::LibXML::XPathContext->new( $graphml ); $xpc->registerNs( 'g', 'http://graphml.graphdrawing.org/xmlns' ); diff --git a/lib/Text/Tradition/Parser/KUL.pm b/lib/Text/Tradition/Parser/KUL.pm index 05c3a12..0469be6 100644 --- a/lib/Text/Tradition/Parser/KUL.pm +++ b/lib/Text/Tradition/Parser/KUL.pm @@ -3,7 +3,7 @@ package Text::Tradition::Parser::KUL; use strict; use warnings; use Storable qw /dclone/; -use Text::CSV::Simple; +use Text::CSV::Simple; # TODO convert to CSV_XS =head1 NAME @@ -29,11 +29,11 @@ merged with a base text. =cut sub read { - my( $csv_file ) = @_; + my( $opts ) = @_; my $parser = Text::CSV::Simple->new(); my @fields = qw/ reference text variant type context non_corr non_indep length total origin /; - my @lines = $parser->read_file( $ARGV[0] ); + my @lines = $parser->read_file( $opts->{'file'} ); my @labels = @{shift( @lines )}; push( @fields, @labels[10..$#labels] ); diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm index ce49448..0c4b83f 100644 --- a/lib/Text/Tradition/Parser/Self.pm +++ b/lib/Text/Tradition/Parser/Self.pm @@ -34,10 +34,10 @@ my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY, source target witness extra relationship/; sub parse { - my( $tradition, $graphml_str ) = @_; + my( $tradition, $opts ) = @_; + my $graph_data = Text::Tradition::Parser::GraphML::parse( $opts ); # TODO this is begging for stream parsing instead of multiple loops. - my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str ); my $collation = $tradition->collation; my %witnesses; diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm index 23876ab..7b68ba2 100644 --- a/lib/Text/Tradition/Parser/TEI.pm +++ b/lib/Text/Tradition/Parser/TEI.pm @@ -57,11 +57,19 @@ sub make_tagnames { # Parse the TEI file. sub parse { - my( $tradition, $xml_str ) = @_; + my( $tradition, $opts ) = @_; # First, parse the XML. my $parser = XML::LibXML->new(); - my $doc = $parser->parse_string( $xml_str ); + my $doc; + if( exists $opts->{'string'} ) { + $doc = $parser->parse_string( $opts->{'string'} ); + } elsif ( exists $opts->{'file'} ) { + $doc = $parser->parse_file( $opts->{'file'} ); + } else { + warn "Could not find string or file option to parse"; + return; + } my $tei = $doc->documentElement(); my $xpc = XML::LibXML::XPathContext->new( $tei ); my $ns; diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm index 199490b..2cf17d0 100644 --- a/lib/Text/Tradition/Parser/Tabular.pm +++ b/lib/Text/Tradition/Parser/Tabular.pm @@ -27,22 +27,36 @@ graph. =cut sub parse { - my( $tradition, $tab_str ) = @_; - # TODO Allow setting of sep_char + my( $tradition, $opts ) = @_; my $c = $tradition->collation; # shorthand - my $csv = Text::CSV_XS->new( { binary => 1, # binary for UTF-8 - sep_char => "\t" } ); - my @lines = split( "\n", $tab_str ); + my $csv = Text::CSV_XS->new( { + binary => 1, # binary for UTF-8 + sep_char => exists $opts->{'sep_char'} ? $opts->{'sep_char'} : "\t" } + ); + # TODO Handle being given a file + my $alignment_table; - foreach my $l ( @lines ) { - my $status = $csv->parse( $l ); - if( $status ) { - push( @$alignment_table, [ $csv->fields ] ); - } else { - warn "Could not parse line $l: " . $csv->error_input; + if( exists $opts->{'string' } ) { + my @lines = split( "\n", $opts->{'string'} ); + foreach my $l ( @lines ) { + my $status = $csv->parse( $l ); + if( $status ) { + push( @$alignment_table, [ $csv->fields ] ); + } else { + warn "Could not parse line $l: " . $csv->error_input; + } } + } elsif( exists $opts->{'file'} ) { + open( my $fh, $opts->{'file'} ) or die "Could not open input file " . $opts->{'file'}; + while( my $row = $csv->getline( $fh ) ) { + push( @$alignment_table, $row ); + } + close $fh; + } else { + warn "Could not find string or file option to parse"; + return; } - + # Set up the witnesses we find in the first line my @witnesses; foreach my $sigil ( @{$alignment_table->[0]} ) { diff --git a/make_tradition.pl b/make_tradition.pl index 22b50df..41c575f 100755 --- a/make_tradition.pl +++ b/make_tradition.pl @@ -45,20 +45,12 @@ if( $informat eq 'KUL' && !$inbase ) { help( "$informat input needs a base text" ); } -# CSV parsing requires a filename; XML parsing requires a string. my $input = $ARGV[0]; -unless( $informat eq 'KUL' || $informat eq 'CSV' ) { - my @lines; - open( INFILE, "$input" ) or die "Could not read $input"; - binmode INFILE, ':utf8'; - @lines = ; - close INFILE; - $input = join( '', @lines ); -} # First: read the base. Make a graph, but also note which # nodes represent line beginnings. -my %args = ( $informat => $input, +my %args = ( 'input' => $informat, + 'file' => $input, 'linear' => $linear ); $args{'base'} = $inbase if $inbase; my $tradition = Text::Tradition->new( %args ); diff --git a/t/01app.t b/t/01app.t index 77eadc8..9f512b4 100644 --- a/t/01app.t +++ b/t/01app.t @@ -1,8 +1,26 @@ #!/usr/bin/env perl use strict; use warnings; -use Test::More tests => 2; +use lib 'lib'; +use Test::More; +use Text::Tradition; -BEGIN { use_ok 'Catalyst::Test', 'lemmatizer' } +BEGIN { use_ok 'Text::Tradition' } -ok( request('/')->is_success, 'Request should succeed' ); +# A simple test, just to make sure we can parse a graph. +my $datafile = 't/data/florilegium_graphml.xml'; +my $tradition = Text::Tradition->new( 'input' => 'TEI', + 'name' => 'test0', + 'file' => $datafile, + 'linear' => 1 ); + +ok( $tradition, "Got a tradition object" ); +is( scalar $tradition->witnesses, 13, "Found all witnesses" ); +ok( $tradition->collation, "Tradition has a collation" ); + +my $c = $tradition->collation; +is( scalar $c->readings, 236, "Collation has all readings" ); +is( scalar $c->paths, 1838, "Collation has all paths" ); +is( scalar $c->relationships, 0, "Collation has all relationships" ); + +done_testing; \ No newline at end of file