Commit | Line | Data |
e58153d6 |
1 | package Text::Tradition::Parser::CSV; |
b49c4318 |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Text::CSV::Simple; |
e58153d6 |
6 | use Text::Tradition::Parser::BaseText qw( merge_base ); |
b49c4318 |
7 | |
2ceca8c3 |
8 | =head1 NAME |
9 | |
10 | Text::Tradition::Parser::CSV |
11 | |
12 | =head1 DESCRIPTION |
13 | |
14 | Parser module for Text::Tradition, given a list of variants as a CSV |
15 | file and a reference text as a plaintext file with appropriate line |
16 | breaks. |
17 | |
18 | =head1 METHODS |
19 | |
20 | =over |
21 | |
22 | =item B<parse> |
23 | |
24 | parse( $graph, 'variants.csv', 'reference.txt' ); |
25 | |
26 | Takes an initialized Text::Tradition::Graph object and the relevant |
27 | data files; puts the text and its variants onto the graph. |
28 | |
29 | =cut |
b49c4318 |
30 | |
31 | sub parse { |
32 | my( $graph, $csv_file, $base_text ) = @_; |
33 | |
34 | # Parse the CSV file into a list of apparatus entries. |
2ceca8c3 |
35 | my @app_list = _read_csv( $csv_file ); |
b49c4318 |
36 | # Now put the base text onto the graph, and merge in the |
37 | # apparatus entries. |
38 | merge_base( $graph, $base_text, @app_list ); |
39 | } |
40 | |
41 | # Takes a CSV file; returns a data structure of apparatus entries to |
42 | # be merged with a base text. |
43 | |
2ceca8c3 |
44 | sub _read_csv { |
b49c4318 |
45 | my( $csv_file ) = @_; |
46 | my $parser = Text::CSV::Simple->new(); |
47 | my @fields = qw/ reference text variant type context non_corr non_indep |
48 | length total origin /; |
49 | my @lines = $parser->read_file( $ARGV[0] ); |
50 | my @labels = @{shift( @lines )}; |
51 | push( @fields, @labels[10..$#labels] ); |
52 | |
53 | my $started = 0; |
54 | my $rdg_ctr = 0; |
55 | my $apparatus = {}; |
56 | my @app_list; |
57 | foreach my $line ( @lines ) { |
58 | my $new_lemma = 0; |
59 | if( $line->[0] =~ /^\d/ ) { |
60 | $new_lemma = $started = 1; |
61 | } |
62 | next unless $started; |
63 | |
64 | # Get the lines into their fields. |
65 | my %linehash; |
66 | @linehash{@fields} = @$line; |
67 | |
68 | # Readings can take up multiple lines in the CSV, so append the |
69 | # apparatus to the list, and clear it out, if we have started a |
70 | # new reading. |
71 | if( $new_lemma ) { |
72 | push( @app_list, $apparatus ) if keys %$apparatus; |
73 | $apparatus = { _id => $linehash{reference}, |
74 | }; |
75 | $rdg_ctr = 0; |
76 | } |
77 | # The apparatus has multiple readings, and multiple witnesses per |
78 | # reading. So it's a hashref whose values are listrefs. |
79 | $apparatus->{ 'rdg_0' } = $linehash{ 'text' } |
80 | if $linehash{ 'text' }; |
81 | $apparatus->{ 'rdg_' . ++$rdg_ctr } = $linehash{ 'variant' }; |
82 | foreach my $attr ( @fields[3..8] ) { |
83 | $apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{ $attr } |
84 | if $linehash{ $attr }; |
85 | } |
86 | |
87 | foreach my $k ( @fields[10..$#fields] ) { |
88 | my $variant_rdg = $linehash{$k}; |
89 | $k =~ s/\s+\(a\.c\.\)//; |
90 | if( $variant_rdg =~ /^0/ ) { |
91 | $apparatus->{ $k } = 'rdg_0' |
92 | unless exists $apparatus->{ $k }; |
93 | } elsif ( $variant_rdg =~ /^1/ ) { |
94 | $apparatus->{ $k } = 'rdg_' . $rdg_ctr; |
95 | } else { # else for $, we don't list the MS |
96 | warn "Unparsed variant indicator $variant_rdg for $k in " . |
97 | $apparatus->{'_id'} |
98 | unless ( !$variant_rdg or $variant_rdg =~ /^\$$/ ); |
99 | } |
100 | } |
101 | # See if we have at least one reading for each variant. |
102 | my @seen_rdgs = values %$apparatus; |
103 | foreach my $rdg ( grep { $_ =~ /^rdg/ } keys %$apparatus ) { |
104 | unless( grep { $_ =~ /^$rdg$/ } @seen_rdgs ) { |
105 | print STDERR 'No manuscript found with reading "' |
106 | . $apparatus->{$rdg} . |
107 | '" at location ' . $apparatus->{_id} . "\n"; |
108 | # delete $apparatus->{$rdg}; # for now |
109 | } |
110 | } |
111 | } |
112 | # Done with loop, so push the last apparatus. |
113 | push( @app_list, $apparatus ); |
114 | return @app_list; |
115 | } |
116 | |
2ceca8c3 |
117 | =back |
118 | |
119 | =head1 LICENSE |
120 | |
121 | This package is free software and is provided "as is" without express |
122 | or implied warranty. You can redistribute it and/or modify it under |
123 | the same terms as Perl itself. |
124 | |
125 | =head1 AUTHOR |
126 | |
127 | Tara L Andrews, aurum@cpan.org |
128 | |
129 | =cut |
130 | |
b49c4318 |
131 | 1; |
132 | |