From: Tara L Andrews <tla@mit.edu>
Date: Mon, 3 Oct 2011 09:18:13 +0000 (+0200)
Subject: allow either file or string to be passed for parsing
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=dfc37e3886452920c1332a045f9102ce29457af1;hp=3bc0cd189b8f6d8182fe009614993805c56deaf6;p=scpubgit%2Fstemmatology.git

allow either file or string to be passed for parsing
---

diff --git a/Tradition.bbprojectd/project.bbprojectdata b/Tradition.bbprojectd/project.bbprojectdata
index f3d8ee0..6dacb92 100644
--- a/Tradition.bbprojectd/project.bbprojectdata
+++ b/Tradition.bbprojectd/project.bbprojectdata
@@ -88,9 +88,9 @@
 				AAAAAAGkAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAA
 				AAAAAADKPI0jSCsAAAAILogRbWFrZV90cmFkaXRpb24u
 				cGwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
-				AAAAAAAAAAAAAAAAAAAAACJK7cp/KfxURVhUAAAAAP//
+				AAAAAAAAAAAAAAAAAAAAADA/68quyjNURVhUAAAAAP//
 				//8AAAkgAAAAAAAAAAAAAAAAAAAADHN0ZW1tYXRvbG9n
-				eQAQAAgAAMo8cQMAAAARAAgAAMp/DdwAAAABABAACC6I
+				eQAQAAgAAMo8cQMAAAARAAgAAMqurhMAAAABABAACC6I
 				AAckkwAFBAYAAL8xAAIAQk1hY2ludG9zaCBIRDpVc2Vy
 				czoAdGxhOgBQcm9qZWN0czoAc3RlbW1hdG9sb2d5OgBt
 				YWtlX3RyYWRpdGlvbi5wbAAOACQAEQBtAGEAawBlAF8A
diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm
index 3dc6338..5fa89b7 100644
--- a/lib/Text/Tradition.pm
+++ b/lib/Text/Tradition.pm
@@ -67,32 +67,35 @@ sub BUILD {
         $self->_save_collation( $collation );
 
         # Call the appropriate parser on the given data
-        my @formats = grep { /^(Self|CollateX|CSV|CTE|KUL|TEI|Tabular)$/ } keys( %$init_args );
-        my $format = shift( @formats );
+        my @format_standalone = qw/ Self CollateX CSV CTE TEI Tabular /;
+        my @format_basetext = qw/ KUL /;
+        my $use_base;
+        my $format = $init_args->{'input'};
         unless( $format ) {
             warn "No data given to create a collation; will initialize an empty one";
         }
-        if( $format && $format =~ /^(KUL)$/ && 
-            !exists $init_args->{'base'} ) {
-            warn "Cannot make a collation from $format without a base text";
+        if( $format && !( grep { $_ eq $format } @format_standalone )
+            && !( grep { $_ eq $format } @format_basetext ) ) {
+            warn "Unrecognized input format $format; not parsing";
             return;
         }
+        if( $format && grep { $_ eq $format } @format_basetext ) {
+            $use_base = 1;
+            if( !exists $init_args->{'base'} ) {
+                warn "Cannot make a collation from $format without a base text";
+                return;
+            }
+        }
 
         # Now do the parsing. 
-        my @sigla;
         if( $format ) {
-            my @parseargs;
-            if( $format =~ /^(KUL)$/ ) {
-                $init_args->{'data'} = $init_args->{$format};
-                $init_args->{'format'} = $format;
-                $format = 'BaseText';
-                @parseargs = %$init_args;
-            } else {
-                @parseargs = ( $init_args->{ $format } ); 
+            if( $use_base ) { 
+                $format = 'BaseText';   # Use the BaseText module for parsing,
+                                        # but retain the original input arg.
             }
             my $mod = "Text::Tradition::Parser::$format";
             load( $mod );
-            $mod->can('parse')->( $self, @parseargs );
+            $mod->can('parse')->( $self, $init_args );
         }
     }
 }
diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm
index ed2a447..e0e6ec2 100644
--- a/lib/Text/Tradition/Parser/BaseText.pm
+++ b/lib/Text/Tradition/Parser/BaseText.pm
@@ -41,12 +41,12 @@ Takes an initialized graph and a set of options, which must include:
 =cut
 
 sub parse {
-    my( $tradition, %opts ) = @_;
+    my( $tradition, $opts ) = @_;
 
-    my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
+    my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
     load( $format_mod );
-    my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
-    merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
+    my @apparatus_entries = $format_mod->can('read')->( $opts->{'file'} );
+    merge_base( $tradition->collation, $opts->{'base'}, @apparatus_entries );
 }
 
 =item B<merge_base>
diff --git a/lib/Text/Tradition/Parser/CTE.pm b/lib/Text/Tradition/Parser/CTE.pm
index 954a84a..e16a9c7 100644
--- a/lib/Text/Tradition/Parser/CTE.pm
+++ b/lib/Text/Tradition/Parser/CTE.pm
@@ -33,12 +33,20 @@ my %sigil_for;  # Save the XML IDs for witnesses.
 my %apps;       # Save the apparatus XML for a given ID.    
 
 sub parse {
-	my( $tradition, $xml_str ) = @_;
+	my( $tradition, $opts ) = @_;
 	my $c = $tradition->collation;	# Some shorthand
 	
 	# First, parse the XML.
 	my $parser = XML::LibXML->new();
-	my $doc = $parser->parse_string( $xml_str );
+    my $doc;
+    if( exists $opts->{'string'} ) {
+        $doc = $parser->parse_string( $opts->{'string'} );
+    } elsif ( exists $opts->{'file'} ) {
+        $doc = $parser->parse_file( $opts->{'file'} );
+    } else {
+        warn "Could not find string or file option to parse";
+        return;
+    }
 	my $tei = $doc->documentElement();
 	my $xpc = XML::LibXML::XPathContext->new( $tei );
 
diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm
index 2ea2cad..2ab0546 100644
--- a/lib/Text/Tradition/Parser/CollateX.pm
+++ b/lib/Text/Tradition/Parser/CollateX.pm
@@ -21,11 +21,11 @@ http://gregor.middell.net/collatex/
 
 =item B<parse>
 
-parse( $graph, $graphml_string );
+parse( $tradition, $init_options );
 
-Takes an initialized Text::Tradition::Graph object and a string
-containing the GraphML; creates the appropriate nodes and edges on the
-graph.
+Takes an initialized Text::Tradition::Graph object and its initialization
+options, including the data source; creates the appropriate nodes and edges 
+on the graph.
 
 =cut
 
@@ -34,8 +34,8 @@ my $CONTENTKEY = 'token';
 my $TRANSKEY = 'identical';
 
 sub parse {
-    my( $tradition, $graphml_str ) = @_;
-    my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str );
+    my( $tradition, $opts ) = @_;
+    my $graph_data = Text::Tradition::Parser::GraphML::parse( $opts );
     my $collation = $tradition->collation;
     my %witnesses; # Keep track of the witnesses we encounter as we
                    # run through the graph data.
diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm
index 27af6af..c29f78a 100644
--- a/lib/Text/Tradition/Parser/GraphML.pm
+++ b/lib/Text/Tradition/Parser/GraphML.pm
@@ -23,9 +23,10 @@ GraphML conventions of the source program.
 
 =item B<parse>
 
-parse( $graphml_string );
+parse( $init_opts );
 
-Takes a string containing the GraphML; returns a list of nodes, edges,
+Takes a set of Tradition initialization options, among which should be either
+'file' or 'string'; parses that file or string and returns a list of nodes, edges,
 and their associated data.
 
 =cut
@@ -36,13 +37,22 @@ use vars qw/ $xpc $graphattr $nodedata $witnesses /;
 #              -> edgeid -> { source, target, wit1/val1, wit2/val2 ...}
 
 sub parse {
-    my( $graphml_str ) = @_;
+    my( $opts ) = @_;
 
     my $graph_hash = { 'nodes' => [],
                        'edges' => [] };
 
     my $parser = XML::LibXML->new();
-    my $doc = $parser->parse_string( $graphml_str );
+    my $doc;
+    if( exists $opts->{'string'} ) {
+        $doc = $parser->parse_string( $opts->{'string'} );
+    } elsif ( exists $opts->{'file'} ) {
+        $doc = $parser->parse_file( $opts->{'file'} );
+    } else {
+        warn "Could not find string or file option to parse";
+        return;
+    }
+    
     my $graphml = $doc->documentElement();
     $xpc = XML::LibXML::XPathContext->new( $graphml );
     $xpc->registerNs( 'g', 'http://graphml.graphdrawing.org/xmlns' );
diff --git a/lib/Text/Tradition/Parser/KUL.pm b/lib/Text/Tradition/Parser/KUL.pm
index 05c3a12..0469be6 100644
--- a/lib/Text/Tradition/Parser/KUL.pm
+++ b/lib/Text/Tradition/Parser/KUL.pm
@@ -3,7 +3,7 @@ package Text::Tradition::Parser::KUL;
 use strict;
 use warnings;
 use Storable qw /dclone/;
-use Text::CSV::Simple;
+use Text::CSV::Simple;  # TODO convert to CSV_XS
 
 =head1 NAME
 
@@ -29,11 +29,11 @@ merged with a base text.
 =cut
 
 sub read {
-    my( $csv_file ) = @_;
+    my( $opts ) = @_;
     my $parser = Text::CSV::Simple->new();
     my @fields = qw/ reference text variant type context non_corr non_indep 
                      length total origin /;
-    my @lines = $parser->read_file( $ARGV[0] );
+    my @lines = $parser->read_file( $opts->{'file'} );
     my @labels = @{shift( @lines )};
     push( @fields, @labels[10..$#labels] );
 
diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm
index ce49448..0c4b83f 100644
--- a/lib/Text/Tradition/Parser/Self.pm
+++ b/lib/Text/Tradition/Parser/Self.pm
@@ -34,10 +34,10 @@ my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY,
     	  source target witness extra relationship/;
 
 sub parse {
-    my( $tradition, $graphml_str ) = @_;
+    my( $tradition, $opts ) = @_;
+    my $graph_data = Text::Tradition::Parser::GraphML::parse( $opts );
     
     # TODO this is begging for stream parsing instead of multiple loops.
-    my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str );
 
     my $collation = $tradition->collation;
     my %witnesses;
diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm
index 23876ab..7b68ba2 100644
--- a/lib/Text/Tradition/Parser/TEI.pm
+++ b/lib/Text/Tradition/Parser/TEI.pm
@@ -57,11 +57,19 @@ sub make_tagnames {
 
 # Parse the TEI file.
 sub parse {
-    my( $tradition, $xml_str ) = @_;
+    my( $tradition, $opts ) = @_;
     
     # First, parse the XML.
     my $parser = XML::LibXML->new();
-    my $doc = $parser->parse_string( $xml_str );
+    my $doc;
+    if( exists $opts->{'string'} ) {
+        $doc = $parser->parse_string( $opts->{'string'} );
+    } elsif ( exists $opts->{'file'} ) {
+        $doc = $parser->parse_file( $opts->{'file'} );
+    } else {
+        warn "Could not find string or file option to parse";
+        return;
+    }
     my $tei = $doc->documentElement();
     my $xpc = XML::LibXML::XPathContext->new( $tei );
     my $ns;
diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm
index 199490b..2cf17d0 100644
--- a/lib/Text/Tradition/Parser/Tabular.pm
+++ b/lib/Text/Tradition/Parser/Tabular.pm
@@ -27,22 +27,36 @@ graph.
 =cut
 
 sub parse {
-    my( $tradition, $tab_str ) = @_;
-    # TODO Allow setting of sep_char
+    my( $tradition, $opts ) = @_;
     my $c = $tradition->collation; # shorthand
-    my $csv = Text::CSV_XS->new( { binary => 1, # binary for UTF-8
-        sep_char => "\t" } );
-    my @lines = split( "\n", $tab_str );
+    my $csv = Text::CSV_XS->new( { 
+        binary => 1, # binary for UTF-8
+        sep_char => exists $opts->{'sep_char'} ? $opts->{'sep_char'} : "\t" } 
+        );
+    # TODO Handle being given a file
+    
     my $alignment_table;
-    foreach my $l ( @lines ) {
-        my $status = $csv->parse( $l );
-        if( $status ) {
-            push( @$alignment_table, [ $csv->fields ] );
-        } else {
-            warn "Could not parse line $l: " . $csv->error_input;
+    if( exists $opts->{'string' } ) {
+        my @lines = split( "\n", $opts->{'string'} );
+        foreach my $l ( @lines ) {
+            my $status = $csv->parse( $l );
+            if( $status ) {
+                push( @$alignment_table, [ $csv->fields ] );
+            } else {
+                warn "Could not parse line $l: " . $csv->error_input;
+            }
         }
+    } elsif( exists $opts->{'file'} ) {
+        open( my $fh, $opts->{'file'} ) or die "Could not open input file " . $opts->{'file'};
+        while( my $row = $csv->getline( $fh ) ) {
+            push( @$alignment_table, $row );
+        }
+        close $fh;
+    } else {
+        warn "Could not find string or file option to parse";
+        return;
     }
-    
+
     # Set up the witnesses we find in the first line
     my @witnesses;
     foreach my $sigil ( @{$alignment_table->[0]} ) {
diff --git a/make_tradition.pl b/make_tradition.pl
index 22b50df..41c575f 100755
--- a/make_tradition.pl
+++ b/make_tradition.pl
@@ -45,20 +45,12 @@ if( $informat eq 'KUL' && !$inbase ) {
     help( "$informat input needs a base text" );
 }
 
-# CSV parsing requires a filename; XML parsing requires a string.
 my $input = $ARGV[0];
-unless( $informat eq 'KUL' || $informat eq 'CSV' ) {
-    my @lines;
-    open( INFILE, "$input" ) or die "Could not read $input";
-    binmode INFILE, ':utf8';
-    @lines = <INFILE>;
-    close INFILE;
-    $input = join( '', @lines );
-}
 
 # First: read the base. Make a graph, but also note which
 # nodes represent line beginnings.
-my %args = ( $informat => $input,
+my %args = ( 'input' => $informat,
+             'file' => $input,
              'linear' => $linear );
 $args{'base'} = $inbase if $inbase;
 my $tradition = Text::Tradition->new( %args );
diff --git a/t/01app.t b/t/01app.t
index 77eadc8..9f512b4 100644
--- a/t/01app.t
+++ b/t/01app.t
@@ -1,8 +1,26 @@
 #!/usr/bin/env perl
 use strict;
 use warnings;
-use Test::More tests => 2;
+use lib 'lib';
+use Test::More;
+use Text::Tradition;
 
-BEGIN { use_ok 'Catalyst::Test', 'lemmatizer' }
+BEGIN { use_ok 'Text::Tradition' }
 
-ok( request('/')->is_success, 'Request should succeed' );
+# A simple test, just to make sure we can parse a graph.
+my $datafile = 't/data/florilegium_graphml.xml';
+my $tradition = Text::Tradition->new( 'input' => 'TEI',
+                                      'name' => 'test0',
+                                      'file' => $datafile,
+                                      'linear' => 1 );
+
+ok( $tradition, "Got a tradition object" );
+is( scalar $tradition->witnesses, 13, "Found all witnesses" );
+ok( $tradition->collation, "Tradition has a collation" );
+
+my $c = $tradition->collation;
+is( scalar $c->readings, 236, "Collation has all readings" );
+is( scalar $c->paths, 1838, "Collation has all paths" );
+is( scalar $c->relationships, 0, "Collation has all relationships" );
+
+done_testing;
\ No newline at end of file