Commit | Line | Data |
910a0a6d |
1 | package Text::Tradition::Parser::KUL; |
b49c4318 |
2 | |
3 | use strict; |
4 | use warnings; |
0e96be5f |
5 | use Storable qw /dclone/; |
dfc37e38 |
6 | use Text::CSV::Simple; # TODO convert to CSV_XS |
b49c4318 |
7 | |
2ceca8c3 |
8 | =head1 NAME |
9 | |
e867486f |
10 | Text::Tradition::Parser::KUL |
2ceca8c3 |
11 | |
12 | =head1 DESCRIPTION |
13 | |
14 | Parser module for Text::Tradition, given a list of variants as a CSV |
15 | file and a reference text as a plaintext file with appropriate line |
e867486f |
16 | breaks. The CSV file is a specialized format developed at KU Leuven, |
17 | and other formats are vastly preferred. |
2ceca8c3 |
18 | |
19 | =head1 METHODS |
20 | |
e867486f |
21 | =head2 B<read> |
2ceca8c3 |
22 | |
52ce987f |
23 | my @apparatus = read( $csv_file ); |
2ceca8c3 |
24 | |
52ce987f |
25 | Takes a CSV file; returns a data structure of apparatus entries to be |
26 | merged with a base text. |
2ceca8c3 |
27 | |
28 | =cut |
b49c4318 |
29 | |
52ce987f |
30 | sub read { |
dfc37e38 |
31 | my( $opts ) = @_; |
b49c4318 |
32 | my $parser = Text::CSV::Simple->new(); |
33 | my @fields = qw/ reference text variant type context non_corr non_indep |
34 | length total origin /; |
dfc37e38 |
35 | my @lines = $parser->read_file( $opts->{'file'} ); |
b49c4318 |
36 | my @labels = @{shift( @lines )}; |
37 | push( @fields, @labels[10..$#labels] ); |
38 | |
39 | my $started = 0; |
40 | my $rdg_ctr = 0; |
41 | my $apparatus = {}; |
42 | my @app_list; |
43 | foreach my $line ( @lines ) { |
44 | my $new_lemma = 0; |
45 | if( $line->[0] =~ /^\d/ ) { |
46 | $new_lemma = $started = 1; |
47 | } |
48 | next unless $started; |
49 | |
50 | # Get the lines into their fields. |
51 | my %linehash; |
52 | @linehash{@fields} = @$line; |
53 | |
54 | # Readings can take up multiple lines in the CSV, so append the |
55 | # apparatus to the list, and clear it out, if we have started a |
56 | # new reading. |
57 | if( $new_lemma ) { |
0e96be5f |
58 | # Was it a doubled-up apparatus entry e.g 'non (1/2)'? |
59 | if( keys %$apparatus && |
60 | $apparatus->{'rdg_0'} =~ /(.*?)\s+\(?([\d\/]+)\)?\s*$/ ) { |
61 | my( $reading, $istr ) = ( $1, $2 ); |
62 | my @instances = split( /\//, $istr ); |
63 | foreach my $i ( @instances ) { |
64 | my $app = dclone( $apparatus ); |
65 | $app->{'rdg_0'} = $reading . "_$i"; |
66 | $app->{'_id'} .= chr(97+$i); |
67 | push( @app_list, $app ); |
68 | } |
69 | } elsif( keys %$apparatus ) { |
70 | push( @app_list, $apparatus ); |
71 | } |
72 | $apparatus = { _id => $linehash{reference} }; |
b49c4318 |
73 | $rdg_ctr = 0; |
74 | } |
75 | # The apparatus has multiple readings, and multiple witnesses per |
76 | # reading. So it's a hashref whose values are listrefs. |
0e96be5f |
77 | $apparatus->{'rdg_0'} = $linehash{'text'} if $linehash{'text'}; |
78 | $apparatus->{'rdg_' . ++$rdg_ctr} = $linehash{'variant'}; |
b49c4318 |
79 | foreach my $attr ( @fields[3..8] ) { |
b15511bf |
80 | $apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{$attr} if defined $linehash{$attr}; |
b49c4318 |
81 | } |
82 | |
83 | foreach my $k ( @fields[10..$#fields] ) { |
84 | my $variant_rdg = $linehash{$k}; |
85 | $k =~ s/\s+\(a\.c\.\)//; |
86 | if( $variant_rdg =~ /^0/ ) { |
0e96be5f |
87 | $apparatus->{$k} = 'rdg_0' |
88 | unless exists $apparatus->{$k}; |
b49c4318 |
89 | } elsif ( $variant_rdg =~ /^1/ ) { |
b15511bf |
90 | warn sprintf( "Already found variant reading %s for %s at %s!", |
91 | $apparatus->{$k}, $k, $apparatus->{_id} ) |
92 | if exists $apparatus->{$k} && $apparatus->{$k} ne 'rdg_0'; |
0e96be5f |
93 | $apparatus->{$k} = 'rdg_' . $rdg_ctr; |
b49c4318 |
94 | } else { # else for $, we don't list the MS |
95 | warn "Unparsed variant indicator $variant_rdg for $k in " . |
96 | $apparatus->{'_id'} |
0e96be5f |
97 | unless ( !$variant_rdg or $variant_rdg =~ /^\$$/ ); |
b49c4318 |
98 | } |
99 | } |
100 | # See if we have at least one reading for each variant. |
101 | my @seen_rdgs = values %$apparatus; |
102 | foreach my $rdg ( grep { $_ =~ /^rdg/ } keys %$apparatus ) { |
103 | unless( grep { $_ =~ /^$rdg$/ } @seen_rdgs ) { |
104 | print STDERR 'No manuscript found with reading "' |
105 | . $apparatus->{$rdg} . |
106 | '" at location ' . $apparatus->{_id} . "\n"; |
107 | # delete $apparatus->{$rdg}; # for now |
108 | } |
109 | } |
110 | } |
111 | # Done with loop, so push the last apparatus. |
112 | push( @app_list, $apparatus ); |
113 | return @app_list; |
114 | } |
115 | |
2ceca8c3 |
116 | =head1 LICENSE |
117 | |
118 | This package is free software and is provided "as is" without express |
119 | or implied warranty. You can redistribute it and/or modify it under |
120 | the same terms as Perl itself. |
121 | |
122 | =head1 AUTHOR |
123 | |
124 | Tara L Andrews, aurum@cpan.org |
125 | |
126 | =cut |
127 | |
b49c4318 |
128 | 1; |
129 | |