Commit | Line | Data |
e58153d6 |
1 | package Text::Tradition::Parser::CSV; |
b49c4318 |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Text::CSV::Simple; |
b49c4318 |
6 | |
2ceca8c3 |
7 | =head1 NAME |
8 | |
9 | Text::Tradition::Parser::CSV |
10 | |
11 | =head1 DESCRIPTION |
12 | |
13 | Parser module for Text::Tradition, given a list of variants as a CSV |
14 | file and a reference text as a plaintext file with appropriate line |
15 | breaks. |
16 | |
17 | =head1 METHODS |
18 | |
19 | =over |
20 | |
52ce987f |
21 | =item B<read> |
2ceca8c3 |
22 | |
52ce987f |
23 | my @apparatus = read( $csv_file ); |
2ceca8c3 |
24 | |
52ce987f |
25 | Takes a CSV file; returns a data structure of apparatus entries to be |
26 | merged with a base text. |
2ceca8c3 |
27 | |
28 | =cut |
b49c4318 |
29 | |
52ce987f |
30 | sub read { |
b49c4318 |
31 | my( $csv_file ) = @_; |
32 | my $parser = Text::CSV::Simple->new(); |
33 | my @fields = qw/ reference text variant type context non_corr non_indep |
34 | length total origin /; |
35 | my @lines = $parser->read_file( $ARGV[0] ); |
36 | my @labels = @{shift( @lines )}; |
37 | push( @fields, @labels[10..$#labels] ); |
38 | |
39 | my $started = 0; |
40 | my $rdg_ctr = 0; |
41 | my $apparatus = {}; |
42 | my @app_list; |
43 | foreach my $line ( @lines ) { |
44 | my $new_lemma = 0; |
45 | if( $line->[0] =~ /^\d/ ) { |
46 | $new_lemma = $started = 1; |
47 | } |
48 | next unless $started; |
49 | |
50 | # Get the lines into their fields. |
51 | my %linehash; |
52 | @linehash{@fields} = @$line; |
53 | |
54 | # Readings can take up multiple lines in the CSV, so append the |
55 | # apparatus to the list, and clear it out, if we have started a |
56 | # new reading. |
57 | if( $new_lemma ) { |
58 | push( @app_list, $apparatus ) if keys %$apparatus; |
59 | $apparatus = { _id => $linehash{reference}, |
60 | }; |
61 | $rdg_ctr = 0; |
62 | } |
63 | # The apparatus has multiple readings, and multiple witnesses per |
64 | # reading. So it's a hashref whose values are listrefs. |
65 | $apparatus->{ 'rdg_0' } = $linehash{ 'text' } |
66 | if $linehash{ 'text' }; |
67 | $apparatus->{ 'rdg_' . ++$rdg_ctr } = $linehash{ 'variant' }; |
68 | foreach my $attr ( @fields[3..8] ) { |
69 | $apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{ $attr } |
70 | if $linehash{ $attr }; |
71 | } |
72 | |
73 | foreach my $k ( @fields[10..$#fields] ) { |
74 | my $variant_rdg = $linehash{$k}; |
75 | $k =~ s/\s+\(a\.c\.\)//; |
76 | if( $variant_rdg =~ /^0/ ) { |
77 | $apparatus->{ $k } = 'rdg_0' |
78 | unless exists $apparatus->{ $k }; |
79 | } elsif ( $variant_rdg =~ /^1/ ) { |
80 | $apparatus->{ $k } = 'rdg_' . $rdg_ctr; |
81 | } else { # else for $, we don't list the MS |
82 | warn "Unparsed variant indicator $variant_rdg for $k in " . |
83 | $apparatus->{'_id'} |
84 | unless ( !$variant_rdg or $variant_rdg =~ /^\$$/ ); |
85 | } |
86 | } |
87 | # See if we have at least one reading for each variant. |
88 | my @seen_rdgs = values %$apparatus; |
89 | foreach my $rdg ( grep { $_ =~ /^rdg/ } keys %$apparatus ) { |
90 | unless( grep { $_ =~ /^$rdg$/ } @seen_rdgs ) { |
91 | print STDERR 'No manuscript found with reading "' |
92 | . $apparatus->{$rdg} . |
93 | '" at location ' . $apparatus->{_id} . "\n"; |
94 | # delete $apparatus->{$rdg}; # for now |
95 | } |
96 | } |
97 | } |
98 | # Done with loop, so push the last apparatus. |
99 | push( @app_list, $apparatus ); |
100 | return @app_list; |
101 | } |
102 | |
2ceca8c3 |
103 | =back |
104 | |
105 | =head1 LICENSE |
106 | |
107 | This package is free software and is provided "as is" without express |
108 | or implied warranty. You can redistribute it and/or modify it under |
109 | the same terms as Perl itself. |
110 | |
111 | =head1 AUTHOR |
112 | |
113 | Tara L Andrews, aurum@cpan.org |
114 | |
115 | =cut |
116 | |
b49c4318 |
117 | 1; |
118 | |