...and fix the namespace in the tests
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / CSV.pm
CommitLineData
e58153d6 1package Text::Tradition::Parser::CSV;
b49c4318 2
3use strict;
4use warnings;
5use Text::CSV::Simple;
e58153d6 6use Text::Tradition::Parser::BaseText qw( merge_base );
b49c4318 7
8# Takes a CSV file and a base text; returns a GraphML object.
9
10sub parse {
11 my( $graph, $csv_file, $base_text ) = @_;
12
13 # Parse the CSV file into a list of apparatus entries.
14 my @app_list = read_csv( $csv_file );
15 # Now put the base text onto the graph, and merge in the
16 # apparatus entries.
17 merge_base( $graph, $base_text, @app_list );
18}
19
20# Takes a CSV file; returns a data structure of apparatus entries to
21# be merged with a base text.
22
23sub read_csv {
24 my( $csv_file ) = @_;
25 my $parser = Text::CSV::Simple->new();
26 my @fields = qw/ reference text variant type context non_corr non_indep
27 length total origin /;
28 my @lines = $parser->read_file( $ARGV[0] );
29 my @labels = @{shift( @lines )};
30 push( @fields, @labels[10..$#labels] );
31
32 my $started = 0;
33 my $rdg_ctr = 0;
34 my $apparatus = {};
35 my @app_list;
36 foreach my $line ( @lines ) {
37 my $new_lemma = 0;
38 if( $line->[0] =~ /^\d/ ) {
39 $new_lemma = $started = 1;
40 }
41 next unless $started;
42
43 # Get the lines into their fields.
44 my %linehash;
45 @linehash{@fields} = @$line;
46
47 # Readings can take up multiple lines in the CSV, so append the
48 # apparatus to the list, and clear it out, if we have started a
49 # new reading.
50 if( $new_lemma ) {
51 push( @app_list, $apparatus ) if keys %$apparatus;
52 $apparatus = { _id => $linehash{reference},
53 };
54 $rdg_ctr = 0;
55 }
56 # The apparatus has multiple readings, and multiple witnesses per
57 # reading. So it's a hashref whose values are listrefs.
58 $apparatus->{ 'rdg_0' } = $linehash{ 'text' }
59 if $linehash{ 'text' };
60 $apparatus->{ 'rdg_' . ++$rdg_ctr } = $linehash{ 'variant' };
61 foreach my $attr ( @fields[3..8] ) {
62 $apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{ $attr }
63 if $linehash{ $attr };
64 }
65
66 foreach my $k ( @fields[10..$#fields] ) {
67 my $variant_rdg = $linehash{$k};
68 $k =~ s/\s+\(a\.c\.\)//;
69 if( $variant_rdg =~ /^0/ ) {
70 $apparatus->{ $k } = 'rdg_0'
71 unless exists $apparatus->{ $k };
72 } elsif ( $variant_rdg =~ /^1/ ) {
73 $apparatus->{ $k } = 'rdg_' . $rdg_ctr;
74 } else { # else for $, we don't list the MS
75 warn "Unparsed variant indicator $variant_rdg for $k in " .
76 $apparatus->{'_id'}
77 unless ( !$variant_rdg or $variant_rdg =~ /^\$$/ );
78 }
79 }
80 # See if we have at least one reading for each variant.
81 my @seen_rdgs = values %$apparatus;
82 foreach my $rdg ( grep { $_ =~ /^rdg/ } keys %$apparatus ) {
83 unless( grep { $_ =~ /^$rdg$/ } @seen_rdgs ) {
84 print STDERR 'No manuscript found with reading "'
85 . $apparatus->{$rdg} .
86 '" at location ' . $apparatus->{_id} . "\n";
87 # delete $apparatus->{$rdg}; # for now
88 }
89 }
90 }
91 # Done with loop, so push the last apparatus.
92 push( @app_list, $apparatus );
93 return @app_list;
94}
95
961;
97