Commit | Line | Data |
910a0a6d |
1 | package Text::Tradition::Parser::KUL; |
b49c4318 |
2 | |
3 | use strict; |
4 | use warnings; |
0e96be5f |
5 | use Storable qw /dclone/; |
dfc37e38 |
6 | use Text::CSV::Simple; # TODO convert to CSV_XS |
b49c4318 |
7 | |
2ceca8c3 |
8 | =head1 NAME |
9 | |
10 | Text::Tradition::Parser::CSV |
11 | |
12 | =head1 DESCRIPTION |
13 | |
14 | Parser module for Text::Tradition, given a list of variants as a CSV |
15 | file and a reference text as a plaintext file with appropriate line |
16 | breaks. |
17 | |
18 | =head1 METHODS |
19 | |
20 | =over |
21 | |
52ce987f |
22 | =item B<read> |
2ceca8c3 |
23 | |
52ce987f |
24 | my @apparatus = read( $csv_file ); |
2ceca8c3 |
25 | |
52ce987f |
26 | Takes a CSV file; returns a data structure of apparatus entries to be |
27 | merged with a base text. |
2ceca8c3 |
28 | |
29 | =cut |
b49c4318 |
30 | |
52ce987f |
31 | sub read { |
dfc37e38 |
32 | my( $opts ) = @_; |
b49c4318 |
33 | my $parser = Text::CSV::Simple->new(); |
34 | my @fields = qw/ reference text variant type context non_corr non_indep |
35 | length total origin /; |
dfc37e38 |
36 | my @lines = $parser->read_file( $opts->{'file'} ); |
b49c4318 |
37 | my @labels = @{shift( @lines )}; |
38 | push( @fields, @labels[10..$#labels] ); |
39 | |
40 | my $started = 0; |
41 | my $rdg_ctr = 0; |
42 | my $apparatus = {}; |
43 | my @app_list; |
44 | foreach my $line ( @lines ) { |
45 | my $new_lemma = 0; |
46 | if( $line->[0] =~ /^\d/ ) { |
47 | $new_lemma = $started = 1; |
48 | } |
49 | next unless $started; |
50 | |
51 | # Get the lines into their fields. |
52 | my %linehash; |
53 | @linehash{@fields} = @$line; |
54 | |
55 | # Readings can take up multiple lines in the CSV, so append the |
56 | # apparatus to the list, and clear it out, if we have started a |
57 | # new reading. |
58 | if( $new_lemma ) { |
0e96be5f |
59 | # Was it a doubled-up apparatus entry e.g 'non (1/2)'? |
60 | if( keys %$apparatus && |
61 | $apparatus->{'rdg_0'} =~ /(.*?)\s+\(?([\d\/]+)\)?\s*$/ ) { |
62 | my( $reading, $istr ) = ( $1, $2 ); |
63 | my @instances = split( /\//, $istr ); |
64 | foreach my $i ( @instances ) { |
65 | my $app = dclone( $apparatus ); |
66 | $app->{'rdg_0'} = $reading . "_$i"; |
67 | $app->{'_id'} .= chr(97+$i); |
68 | push( @app_list, $app ); |
69 | } |
70 | } elsif( keys %$apparatus ) { |
71 | push( @app_list, $apparatus ); |
72 | } |
73 | $apparatus = { _id => $linehash{reference} }; |
b49c4318 |
74 | $rdg_ctr = 0; |
75 | } |
76 | # The apparatus has multiple readings, and multiple witnesses per |
77 | # reading. So it's a hashref whose values are listrefs. |
0e96be5f |
78 | $apparatus->{'rdg_0'} = $linehash{'text'} if $linehash{'text'}; |
79 | $apparatus->{'rdg_' . ++$rdg_ctr} = $linehash{'variant'}; |
b49c4318 |
80 | foreach my $attr ( @fields[3..8] ) { |
b15511bf |
81 | $apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{$attr} if defined $linehash{$attr}; |
b49c4318 |
82 | } |
83 | |
84 | foreach my $k ( @fields[10..$#fields] ) { |
85 | my $variant_rdg = $linehash{$k}; |
86 | $k =~ s/\s+\(a\.c\.\)//; |
87 | if( $variant_rdg =~ /^0/ ) { |
0e96be5f |
88 | $apparatus->{$k} = 'rdg_0' |
89 | unless exists $apparatus->{$k}; |
b49c4318 |
90 | } elsif ( $variant_rdg =~ /^1/ ) { |
b15511bf |
91 | warn sprintf( "Already found variant reading %s for %s at %s!", |
92 | $apparatus->{$k}, $k, $apparatus->{_id} ) |
93 | if exists $apparatus->{$k} && $apparatus->{$k} ne 'rdg_0'; |
0e96be5f |
94 | $apparatus->{$k} = 'rdg_' . $rdg_ctr; |
b49c4318 |
95 | } else { # else for $, we don't list the MS |
96 | warn "Unparsed variant indicator $variant_rdg for $k in " . |
97 | $apparatus->{'_id'} |
0e96be5f |
98 | unless ( !$variant_rdg or $variant_rdg =~ /^\$$/ ); |
b49c4318 |
99 | } |
100 | } |
101 | # See if we have at least one reading for each variant. |
102 | my @seen_rdgs = values %$apparatus; |
103 | foreach my $rdg ( grep { $_ =~ /^rdg/ } keys %$apparatus ) { |
104 | unless( grep { $_ =~ /^$rdg$/ } @seen_rdgs ) { |
105 | print STDERR 'No manuscript found with reading "' |
106 | . $apparatus->{$rdg} . |
107 | '" at location ' . $apparatus->{_id} . "\n"; |
108 | # delete $apparatus->{$rdg}; # for now |
109 | } |
110 | } |
111 | } |
112 | # Done with loop, so push the last apparatus. |
113 | push( @app_list, $apparatus ); |
114 | return @app_list; |
115 | } |
116 | |
2ceca8c3 |
117 | =back |
118 | |
119 | =head1 LICENSE |
120 | |
121 | This package is free software and is provided "as is" without express |
122 | or implied warranty. You can redistribute it and/or modify it under |
123 | the same terms as Perl itself. |
124 | |
125 | =head1 AUTHOR |
126 | |
127 | Tara L Andrews, aurum@cpan.org |
128 | |
129 | =cut |
130 | |
b49c4318 |
131 | 1; |
132 | |