Commit | Line | Data |
e58153d6 |
1 | package Text::Tradition::Parser::CSV; |
b49c4318 |
2 | |
3 | use strict; |
4 | use warnings; |
0e96be5f |
5 | use Storable qw /dclone/; |
b49c4318 |
6 | use Text::CSV::Simple; |
b49c4318 |
7 | |
2ceca8c3 |
8 | =head1 NAME |
9 | |
10 | Text::Tradition::Parser::CSV |
11 | |
12 | =head1 DESCRIPTION |
13 | |
14 | Parser module for Text::Tradition, given a list of variants as a CSV |
15 | file and a reference text as a plaintext file with appropriate line |
16 | breaks. |
17 | |
18 | =head1 METHODS |
19 | |
20 | =over |
21 | |
52ce987f |
22 | =item B<read> |
2ceca8c3 |
23 | |
52ce987f |
24 | my @apparatus = read( $csv_file ); |
2ceca8c3 |
25 | |
52ce987f |
26 | Takes a CSV file; returns a data structure of apparatus entries to be |
27 | merged with a base text. |
2ceca8c3 |
28 | |
29 | =cut |
b49c4318 |
30 | |
52ce987f |
31 | sub read { |
b49c4318 |
32 | my( $csv_file ) = @_; |
33 | my $parser = Text::CSV::Simple->new(); |
34 | my @fields = qw/ reference text variant type context non_corr non_indep |
35 | length total origin /; |
36 | my @lines = $parser->read_file( $ARGV[0] ); |
37 | my @labels = @{shift( @lines )}; |
38 | push( @fields, @labels[10..$#labels] ); |
39 | |
40 | my $started = 0; |
41 | my $rdg_ctr = 0; |
42 | my $apparatus = {}; |
43 | my @app_list; |
44 | foreach my $line ( @lines ) { |
45 | my $new_lemma = 0; |
46 | if( $line->[0] =~ /^\d/ ) { |
47 | $new_lemma = $started = 1; |
48 | } |
49 | next unless $started; |
50 | |
51 | # Get the lines into their fields. |
52 | my %linehash; |
53 | @linehash{@fields} = @$line; |
54 | |
55 | # Readings can take up multiple lines in the CSV, so append the |
56 | # apparatus to the list, and clear it out, if we have started a |
57 | # new reading. |
58 | if( $new_lemma ) { |
0e96be5f |
59 | # Was it a doubled-up apparatus entry e.g 'non (1/2)'? |
60 | if( keys %$apparatus && |
61 | $apparatus->{'rdg_0'} =~ /(.*?)\s+\(?([\d\/]+)\)?\s*$/ ) { |
62 | my( $reading, $istr ) = ( $1, $2 ); |
63 | my @instances = split( /\//, $istr ); |
64 | foreach my $i ( @instances ) { |
65 | my $app = dclone( $apparatus ); |
66 | $app->{'rdg_0'} = $reading . "_$i"; |
67 | $app->{'_id'} .= chr(97+$i); |
68 | push( @app_list, $app ); |
69 | } |
70 | } elsif( keys %$apparatus ) { |
71 | push( @app_list, $apparatus ); |
72 | } |
73 | $apparatus = { _id => $linehash{reference} }; |
b49c4318 |
74 | $rdg_ctr = 0; |
75 | } |
76 | # The apparatus has multiple readings, and multiple witnesses per |
77 | # reading. So it's a hashref whose values are listrefs. |
0e96be5f |
78 | $apparatus->{'rdg_0'} = $linehash{'text'} if $linehash{'text'}; |
79 | $apparatus->{'rdg_' . ++$rdg_ctr} = $linehash{'variant'}; |
b49c4318 |
80 | foreach my $attr ( @fields[3..8] ) { |
0e96be5f |
81 | $apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{$attr} if $linehash{$attr}; |
b49c4318 |
82 | } |
83 | |
84 | foreach my $k ( @fields[10..$#fields] ) { |
85 | my $variant_rdg = $linehash{$k}; |
86 | $k =~ s/\s+\(a\.c\.\)//; |
87 | if( $variant_rdg =~ /^0/ ) { |
0e96be5f |
88 | $apparatus->{$k} = 'rdg_0' |
89 | unless exists $apparatus->{$k}; |
b49c4318 |
90 | } elsif ( $variant_rdg =~ /^1/ ) { |
0e96be5f |
91 | $apparatus->{$k} = 'rdg_' . $rdg_ctr; |
b49c4318 |
92 | } else { # else for $, we don't list the MS |
93 | warn "Unparsed variant indicator $variant_rdg for $k in " . |
94 | $apparatus->{'_id'} |
0e96be5f |
95 | unless ( !$variant_rdg or $variant_rdg =~ /^\$$/ ); |
b49c4318 |
96 | } |
97 | } |
98 | # See if we have at least one reading for each variant. |
99 | my @seen_rdgs = values %$apparatus; |
100 | foreach my $rdg ( grep { $_ =~ /^rdg/ } keys %$apparatus ) { |
101 | unless( grep { $_ =~ /^$rdg$/ } @seen_rdgs ) { |
102 | print STDERR 'No manuscript found with reading "' |
103 | . $apparatus->{$rdg} . |
104 | '" at location ' . $apparatus->{_id} . "\n"; |
105 | # delete $apparatus->{$rdg}; # for now |
106 | } |
107 | } |
108 | } |
109 | # Done with loop, so push the last apparatus. |
110 | push( @app_list, $apparatus ); |
111 | return @app_list; |
112 | } |
113 | |
2ceca8c3 |
114 | =back |
115 | |
116 | =head1 LICENSE |
117 | |
118 | This package is free software and is provided "as is" without express |
119 | or implied warranty. You can redistribute it and/or modify it under |
120 | the same terms as Perl itself. |
121 | |
122 | =head1 AUTHOR |
123 | |
124 | Tara L Andrews, aurum@cpan.org |
125 | |
126 | =cut |
127 | |
b49c4318 |
128 | 1; |
129 | |