remove some debugging statements
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / KUL.pm
CommitLineData
910a0a6d 1package Text::Tradition::Parser::KUL;
b49c4318 2
3use strict;
4use warnings;
0e96be5f 5use Storable qw /dclone/;
dfc37e38 6use Text::CSV::Simple; # TODO convert to CSV_XS
b49c4318 7
2ceca8c3 8=head1 NAME
9
e867486f 10Text::Tradition::Parser::KUL
2ceca8c3 11
12=head1 DESCRIPTION
13
14Parser module for Text::Tradition, given a list of variants as a CSV
15file and a reference text as a plaintext file with appropriate line
e867486f 16breaks. The CSV file is a specialized format developed at KU Leuven,
17and other formats are vastly preferred.
2ceca8c3 18
19=head1 METHODS
20
e867486f 21=head2 B<read>
2ceca8c3 22
52ce987f 23my @apparatus = read( $csv_file );
2ceca8c3 24
52ce987f 25Takes a CSV file; returns a data structure of apparatus entries to be
26merged with a base text.
2ceca8c3 27
28=cut
b49c4318 29
52ce987f 30sub read {
dfc37e38 31 my( $opts ) = @_;
b49c4318 32 my $parser = Text::CSV::Simple->new();
33 my @fields = qw/ reference text variant type context non_corr non_indep
34 length total origin /;
dfc37e38 35 my @lines = $parser->read_file( $opts->{'file'} );
b49c4318 36 my @labels = @{shift( @lines )};
37 push( @fields, @labels[10..$#labels] );
38
39 my $started = 0;
40 my $rdg_ctr = 0;
41 my $apparatus = {};
42 my @app_list;
43 foreach my $line ( @lines ) {
44 my $new_lemma = 0;
45 if( $line->[0] =~ /^\d/ ) {
46 $new_lemma = $started = 1;
47 }
48 next unless $started;
49
50 # Get the lines into their fields.
51 my %linehash;
52 @linehash{@fields} = @$line;
53
54 # Readings can take up multiple lines in the CSV, so append the
55 # apparatus to the list, and clear it out, if we have started a
56 # new reading.
57 if( $new_lemma ) {
0e96be5f 58 # Was it a doubled-up apparatus entry e.g 'non (1/2)'?
59 if( keys %$apparatus &&
60 $apparatus->{'rdg_0'} =~ /(.*?)\s+\(?([\d\/]+)\)?\s*$/ ) {
61 my( $reading, $istr ) = ( $1, $2 );
62 my @instances = split( /\//, $istr );
63 foreach my $i ( @instances ) {
64 my $app = dclone( $apparatus );
65 $app->{'rdg_0'} = $reading . "_$i";
66 $app->{'_id'} .= chr(97+$i);
67 push( @app_list, $app );
68 }
69 } elsif( keys %$apparatus ) {
70 push( @app_list, $apparatus );
71 }
72 $apparatus = { _id => $linehash{reference} };
b49c4318 73 $rdg_ctr = 0;
74 }
75 # The apparatus has multiple readings, and multiple witnesses per
76 # reading. So it's a hashref whose values are listrefs.
0e96be5f 77 $apparatus->{'rdg_0'} = $linehash{'text'} if $linehash{'text'};
78 $apparatus->{'rdg_' . ++$rdg_ctr} = $linehash{'variant'};
b49c4318 79 foreach my $attr ( @fields[3..8] ) {
b15511bf 80 $apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{$attr} if defined $linehash{$attr};
b49c4318 81 }
82
83 foreach my $k ( @fields[10..$#fields] ) {
84 my $variant_rdg = $linehash{$k};
85 $k =~ s/\s+\(a\.c\.\)//;
86 if( $variant_rdg =~ /^0/ ) {
0e96be5f 87 $apparatus->{$k} = 'rdg_0'
88 unless exists $apparatus->{$k};
b49c4318 89 } elsif ( $variant_rdg =~ /^1/ ) {
b15511bf 90 warn sprintf( "Already found variant reading %s for %s at %s!",
91 $apparatus->{$k}, $k, $apparatus->{_id} )
92 if exists $apparatus->{$k} && $apparatus->{$k} ne 'rdg_0';
0e96be5f 93 $apparatus->{$k} = 'rdg_' . $rdg_ctr;
b49c4318 94 } else { # else for $, we don't list the MS
95 warn "Unparsed variant indicator $variant_rdg for $k in " .
96 $apparatus->{'_id'}
0e96be5f 97 unless ( !$variant_rdg or $variant_rdg =~ /^\$$/ );
b49c4318 98 }
99 }
100 # See if we have at least one reading for each variant.
101 my @seen_rdgs = values %$apparatus;
102 foreach my $rdg ( grep { $_ =~ /^rdg/ } keys %$apparatus ) {
103 unless( grep { $_ =~ /^$rdg$/ } @seen_rdgs ) {
104 print STDERR 'No manuscript found with reading "'
105 . $apparatus->{$rdg} .
106 '" at location ' . $apparatus->{_id} . "\n";
107 # delete $apparatus->{$rdg}; # for now
108 }
109 }
110 }
111 # Done with loop, so push the last apparatus.
112 push( @app_list, $apparatus );
113 return @app_list;
114}
115
2ceca8c3 116=head1 LICENSE
117
118This package is free software and is provided "as is" without express
119or implied warranty. You can redistribute it and/or modify it under
120the same terms as Perl itself.
121
122=head1 AUTHOR
123
124Tara L Andrews, aurum@cpan.org
125
126=cut
127
b49c4318 1281;
129