make the first couple of tests pass
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / CSV.pm
CommitLineData
e58153d6 1package Text::Tradition::Parser::CSV;
b49c4318 2
3use strict;
4use warnings;
5use Text::CSV::Simple;
b49c4318 6
2ceca8c3 7=head1 NAME
8
9Text::Tradition::Parser::CSV
10
11=head1 DESCRIPTION
12
13Parser module for Text::Tradition, given a list of variants as a CSV
14file and a reference text as a plaintext file with appropriate line
15breaks.
16
17=head1 METHODS
18
19=over
20
52ce987f 21=item B<read>
2ceca8c3 22
52ce987f 23my @apparatus = read( $csv_file );
2ceca8c3 24
52ce987f 25Takes a CSV file; returns a data structure of apparatus entries to be
26merged with a base text.
2ceca8c3 27
28=cut
b49c4318 29
52ce987f 30sub read {
b49c4318 31 my( $csv_file ) = @_;
32 my $parser = Text::CSV::Simple->new();
33 my @fields = qw/ reference text variant type context non_corr non_indep
34 length total origin /;
35 my @lines = $parser->read_file( $ARGV[0] );
36 my @labels = @{shift( @lines )};
37 push( @fields, @labels[10..$#labels] );
38
39 my $started = 0;
40 my $rdg_ctr = 0;
41 my $apparatus = {};
42 my @app_list;
43 foreach my $line ( @lines ) {
44 my $new_lemma = 0;
45 if( $line->[0] =~ /^\d/ ) {
46 $new_lemma = $started = 1;
47 }
48 next unless $started;
49
50 # Get the lines into their fields.
51 my %linehash;
52 @linehash{@fields} = @$line;
53
54 # Readings can take up multiple lines in the CSV, so append the
55 # apparatus to the list, and clear it out, if we have started a
56 # new reading.
57 if( $new_lemma ) {
58 push( @app_list, $apparatus ) if keys %$apparatus;
59 $apparatus = { _id => $linehash{reference},
60 };
61 $rdg_ctr = 0;
62 }
63 # The apparatus has multiple readings, and multiple witnesses per
64 # reading. So it's a hashref whose values are listrefs.
65 $apparatus->{ 'rdg_0' } = $linehash{ 'text' }
66 if $linehash{ 'text' };
67 $apparatus->{ 'rdg_' . ++$rdg_ctr } = $linehash{ 'variant' };
68 foreach my $attr ( @fields[3..8] ) {
69 $apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{ $attr }
70 if $linehash{ $attr };
71 }
72
73 foreach my $k ( @fields[10..$#fields] ) {
74 my $variant_rdg = $linehash{$k};
75 $k =~ s/\s+\(a\.c\.\)//;
76 if( $variant_rdg =~ /^0/ ) {
77 $apparatus->{ $k } = 'rdg_0'
78 unless exists $apparatus->{ $k };
79 } elsif ( $variant_rdg =~ /^1/ ) {
80 $apparatus->{ $k } = 'rdg_' . $rdg_ctr;
81 } else { # else for $, we don't list the MS
82 warn "Unparsed variant indicator $variant_rdg for $k in " .
83 $apparatus->{'_id'}
84 unless ( !$variant_rdg or $variant_rdg =~ /^\$$/ );
85 }
86 }
87 # See if we have at least one reading for each variant.
88 my @seen_rdgs = values %$apparatus;
89 foreach my $rdg ( grep { $_ =~ /^rdg/ } keys %$apparatus ) {
90 unless( grep { $_ =~ /^$rdg$/ } @seen_rdgs ) {
91 print STDERR 'No manuscript found with reading "'
92 . $apparatus->{$rdg} .
93 '" at location ' . $apparatus->{_id} . "\n";
94 # delete $apparatus->{$rdg}; # for now
95 }
96 }
97 }
98 # Done with loop, so push the last apparatus.
99 push( @app_list, $apparatus );
100 return @app_list;
101}
102
2ceca8c3 103=back
104
105=head1 LICENSE
106
107This package is free software and is provided "as is" without express
108or implied warranty. You can redistribute it and/or modify it under
109the same terms as Perl itself.
110
111=head1 AUTHOR
112
113Tara L Andrews, aurum@cpan.org
114
115=cut
116
b49c4318 1171;
118