Commit | Line | Data |
e58153d6 |
1 | package Text::Tradition::Parser::CSV; |
b49c4318 |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Text::CSV::Simple; |
e58153d6 |
6 | use Text::Tradition::Parser::BaseText qw( merge_base ); |
b49c4318 |
7 | |
8 | # Takes a CSV file and a base text; returns a GraphML object. |
9 | |
10 | sub parse { |
11 | my( $graph, $csv_file, $base_text ) = @_; |
12 | |
13 | # Parse the CSV file into a list of apparatus entries. |
14 | my @app_list = read_csv( $csv_file ); |
15 | # Now put the base text onto the graph, and merge in the |
16 | # apparatus entries. |
17 | merge_base( $graph, $base_text, @app_list ); |
18 | } |
19 | |
20 | # Takes a CSV file; returns a data structure of apparatus entries to |
21 | # be merged with a base text. |
22 | |
23 | sub read_csv { |
24 | my( $csv_file ) = @_; |
25 | my $parser = Text::CSV::Simple->new(); |
26 | my @fields = qw/ reference text variant type context non_corr non_indep |
27 | length total origin /; |
28 | my @lines = $parser->read_file( $ARGV[0] ); |
29 | my @labels = @{shift( @lines )}; |
30 | push( @fields, @labels[10..$#labels] ); |
31 | |
32 | my $started = 0; |
33 | my $rdg_ctr = 0; |
34 | my $apparatus = {}; |
35 | my @app_list; |
36 | foreach my $line ( @lines ) { |
37 | my $new_lemma = 0; |
38 | if( $line->[0] =~ /^\d/ ) { |
39 | $new_lemma = $started = 1; |
40 | } |
41 | next unless $started; |
42 | |
43 | # Get the lines into their fields. |
44 | my %linehash; |
45 | @linehash{@fields} = @$line; |
46 | |
47 | # Readings can take up multiple lines in the CSV, so append the |
48 | # apparatus to the list, and clear it out, if we have started a |
49 | # new reading. |
50 | if( $new_lemma ) { |
51 | push( @app_list, $apparatus ) if keys %$apparatus; |
52 | $apparatus = { _id => $linehash{reference}, |
53 | }; |
54 | $rdg_ctr = 0; |
55 | } |
56 | # The apparatus has multiple readings, and multiple witnesses per |
57 | # reading. So it's a hashref whose values are listrefs. |
58 | $apparatus->{ 'rdg_0' } = $linehash{ 'text' } |
59 | if $linehash{ 'text' }; |
60 | $apparatus->{ 'rdg_' . ++$rdg_ctr } = $linehash{ 'variant' }; |
61 | foreach my $attr ( @fields[3..8] ) { |
62 | $apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{ $attr } |
63 | if $linehash{ $attr }; |
64 | } |
65 | |
66 | foreach my $k ( @fields[10..$#fields] ) { |
67 | my $variant_rdg = $linehash{$k}; |
68 | $k =~ s/\s+\(a\.c\.\)//; |
69 | if( $variant_rdg =~ /^0/ ) { |
70 | $apparatus->{ $k } = 'rdg_0' |
71 | unless exists $apparatus->{ $k }; |
72 | } elsif ( $variant_rdg =~ /^1/ ) { |
73 | $apparatus->{ $k } = 'rdg_' . $rdg_ctr; |
74 | } else { # else for $, we don't list the MS |
75 | warn "Unparsed variant indicator $variant_rdg for $k in " . |
76 | $apparatus->{'_id'} |
77 | unless ( !$variant_rdg or $variant_rdg =~ /^\$$/ ); |
78 | } |
79 | } |
80 | # See if we have at least one reading for each variant. |
81 | my @seen_rdgs = values %$apparatus; |
82 | foreach my $rdg ( grep { $_ =~ /^rdg/ } keys %$apparatus ) { |
83 | unless( grep { $_ =~ /^$rdg$/ } @seen_rdgs ) { |
84 | print STDERR 'No manuscript found with reading "' |
85 | . $apparatus->{$rdg} . |
86 | '" at location ' . $apparatus->{_id} . "\n"; |
87 | # delete $apparatus->{$rdg}; # for now |
88 | } |
89 | } |
90 | } |
91 | # Done with loop, so push the last apparatus. |
92 | push( @app_list, $apparatus ); |
93 | return @app_list; |
94 | } |
95 | |
96 | 1; |
97 | |