allow either file or string to be passed for parsing
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Tabular.pm
CommitLineData
d9e873d0 1package Text::Tradition::Parser::Tabular;
2
3use strict;
4use warnings;
5use Text::CSV_XS;
6
7=head1 NAME
8
9Text::Tradition::Parser::Tabular
10
11=head1 DESCRIPTION
12
13Parser module for Text::Tradition to read an alignment table format, such as CSV.
14
15=head1 METHODS
16
17=over
18
19=item B<parse>
20
21parse( $graph, $graphml_string );
22
23Takes an initialized Text::Tradition::Graph object and a string
24containing the GraphML; creates the appropriate nodes and edges on the
25graph.
26
27=cut
28
29sub parse {
dfc37e38 30 my( $tradition, $opts ) = @_;
d9e873d0 31 my $c = $tradition->collation; # shorthand
dfc37e38 32 my $csv = Text::CSV_XS->new( {
33 binary => 1, # binary for UTF-8
34 sep_char => exists $opts->{'sep_char'} ? $opts->{'sep_char'} : "\t" }
35 );
36 # TODO Handle being given a file
37
d9e873d0 38 my $alignment_table;
dfc37e38 39 if( exists $opts->{'string' } ) {
40 my @lines = split( "\n", $opts->{'string'} );
41 foreach my $l ( @lines ) {
42 my $status = $csv->parse( $l );
43 if( $status ) {
44 push( @$alignment_table, [ $csv->fields ] );
45 } else {
46 warn "Could not parse line $l: " . $csv->error_input;
47 }
48 }
49 } elsif( exists $opts->{'file'} ) {
50 open( my $fh, $opts->{'file'} ) or die "Could not open input file " . $opts->{'file'};
51 while( my $row = $csv->getline( $fh ) ) {
52 push( @$alignment_table, $row );
d9e873d0 53 }
dfc37e38 54 close $fh;
55 } else {
56 warn "Could not find string or file option to parse";
57 return;
d9e873d0 58 }
dfc37e38 59
d9e873d0 60 # Set up the witnesses we find in the first line
61 my @witnesses;
62 foreach my $sigil ( @{$alignment_table->[0]} ) {
63 my $wit = $tradition->add_witness( 'sigil' => $sigil );
64 $wit->path( [ $c->start ] );
65 push( @witnesses, $wit );
66 }
67
68 # Now for the next rows, make nodes as necessary, assign their ranks, and
69 # add them to the witness paths.
d9e873d0 70 foreach my $idx ( 1 .. $#{$alignment_table} ) {
71 my $row = $alignment_table->[$idx];
72 my $nodes = make_nodes( $c, $row, $idx );
73 foreach my $w ( 0 .. $#{$row} ) {
74 # push the appropriate node onto the appropriate witness path
75 my $word = $row->[$w];
76 if( $word ) {
77 my $reading = $nodes->{$word};
78 my $wit = $witnesses[$w];
79 push( @{$wit->path}, $reading );
80 } # else skip it for empty readings.
81 }
82 }
83
eca16057 84
85 # Collapse our lacunae into a single node and
86 # push the end node onto all paths.
d9e873d0 87 $c->end->rank( scalar @$alignment_table );
88 foreach my $wit ( @witnesses ) {
eca16057 89 my $p = $wit->path;
90 my $last_rdg = shift @$p;
91 my $new_p = [ $last_rdg ];
92 foreach my $rdg ( @$p ) {
93 if( $rdg->text eq '#LACUNA#' ) {
94 # If we are in a lacuna already, drop this node.
95 # Otherwise make a lacuna node and drop this node.
96 unless( $last_rdg->is_lacuna ) {
97 my $l = $c->add_lacuna( $rdg->name );
98 $l->rank( $rdg->rank );
99 push( @$new_p, $l );
100 $last_rdg = $l;
101 }
102 $c->del_reading( $rdg );
103 } else {
104 # No lacuna, save the reading.
105 push( @$new_p, $rdg );
106 }
107 }
108 push( @$new_p, $c->end );
109 $wit->path( $new_p );
d9e873d0 110 }
111
112 # Join up the paths.
113 $c->make_witness_paths;
d9e873d0 114}
115
116sub make_nodes {
117 my( $collation, $row, $index ) = @_;
118 my %unique;
119 foreach my $w ( @$row ) {
120 $unique{$w} = 1 if $w;
121 }
122 my $ctr = 1;
123 foreach my $w ( keys %unique ) {
124 my $r = $collation->add_reading( "$index,$ctr" );
125 $ctr++;
126 $r->rank( $index );
127 $r->text( $w );
128 $unique{$w} = $r;
129 }
130 return \%unique;
131}
132
1331;