Add some documentation
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / CSV.pm
CommitLineData
e58153d6 1package Text::Tradition::Parser::CSV;
b49c4318 2
3use strict;
4use warnings;
5use Text::CSV::Simple;
e58153d6 6use Text::Tradition::Parser::BaseText qw( merge_base );
b49c4318 7
2ceca8c3 8=head1 NAME
9
10Text::Tradition::Parser::CSV
11
12=head1 DESCRIPTION
13
14Parser module for Text::Tradition, given a list of variants as a CSV
15file and a reference text as a plaintext file with appropriate line
16breaks.
17
18=head1 METHODS
19
20=over
21
22=item B<parse>
23
24parse( $graph, 'variants.csv', 'reference.txt' );
25
26Takes an initialized Text::Tradition::Graph object and the relevant
27data files; puts the text and its variants onto the graph.
28
29=cut
b49c4318 30
31sub parse {
32 my( $graph, $csv_file, $base_text ) = @_;
33
34 # Parse the CSV file into a list of apparatus entries.
2ceca8c3 35 my @app_list = _read_csv( $csv_file );
b49c4318 36 # Now put the base text onto the graph, and merge in the
37 # apparatus entries.
38 merge_base( $graph, $base_text, @app_list );
39}
40
41# Takes a CSV file; returns a data structure of apparatus entries to
42# be merged with a base text.
43
2ceca8c3 44sub _read_csv {
b49c4318 45 my( $csv_file ) = @_;
46 my $parser = Text::CSV::Simple->new();
47 my @fields = qw/ reference text variant type context non_corr non_indep
48 length total origin /;
49 my @lines = $parser->read_file( $ARGV[0] );
50 my @labels = @{shift( @lines )};
51 push( @fields, @labels[10..$#labels] );
52
53 my $started = 0;
54 my $rdg_ctr = 0;
55 my $apparatus = {};
56 my @app_list;
57 foreach my $line ( @lines ) {
58 my $new_lemma = 0;
59 if( $line->[0] =~ /^\d/ ) {
60 $new_lemma = $started = 1;
61 }
62 next unless $started;
63
64 # Get the lines into their fields.
65 my %linehash;
66 @linehash{@fields} = @$line;
67
68 # Readings can take up multiple lines in the CSV, so append the
69 # apparatus to the list, and clear it out, if we have started a
70 # new reading.
71 if( $new_lemma ) {
72 push( @app_list, $apparatus ) if keys %$apparatus;
73 $apparatus = { _id => $linehash{reference},
74 };
75 $rdg_ctr = 0;
76 }
77 # The apparatus has multiple readings, and multiple witnesses per
78 # reading. So it's a hashref whose values are listrefs.
79 $apparatus->{ 'rdg_0' } = $linehash{ 'text' }
80 if $linehash{ 'text' };
81 $apparatus->{ 'rdg_' . ++$rdg_ctr } = $linehash{ 'variant' };
82 foreach my $attr ( @fields[3..8] ) {
83 $apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{ $attr }
84 if $linehash{ $attr };
85 }
86
87 foreach my $k ( @fields[10..$#fields] ) {
88 my $variant_rdg = $linehash{$k};
89 $k =~ s/\s+\(a\.c\.\)//;
90 if( $variant_rdg =~ /^0/ ) {
91 $apparatus->{ $k } = 'rdg_0'
92 unless exists $apparatus->{ $k };
93 } elsif ( $variant_rdg =~ /^1/ ) {
94 $apparatus->{ $k } = 'rdg_' . $rdg_ctr;
95 } else { # else for $, we don't list the MS
96 warn "Unparsed variant indicator $variant_rdg for $k in " .
97 $apparatus->{'_id'}
98 unless ( !$variant_rdg or $variant_rdg =~ /^\$$/ );
99 }
100 }
101 # See if we have at least one reading for each variant.
102 my @seen_rdgs = values %$apparatus;
103 foreach my $rdg ( grep { $_ =~ /^rdg/ } keys %$apparatus ) {
104 unless( grep { $_ =~ /^$rdg$/ } @seen_rdgs ) {
105 print STDERR 'No manuscript found with reading "'
106 . $apparatus->{$rdg} .
107 '" at location ' . $apparatus->{_id} . "\n";
108 # delete $apparatus->{$rdg}; # for now
109 }
110 }
111 }
112 # Done with loop, so push the last apparatus.
113 push( @app_list, $apparatus );
114 return @app_list;
115}
116
2ceca8c3 117=back
118
119=head1 LICENSE
120
121This package is free software and is provided "as is" without express
122or implied warranty. You can redistribute it and/or modify it under
123the same terms as Perl itself.
124
125=head1 AUTHOR
126
127Tara L Andrews, aurum@cpan.org
128
129=cut
130
b49c4318 1311;
132