[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / CSV.pm

package Text::Tradition::Parser::CSV;

use strict;
use warnings;
use Text::CSV::Simple;
use Text::Tradition::Parser::BaseText qw( merge_base );

# Takes a CSV file and a base text; returns a GraphML object.

sub parse {
    my( $graph, $csv_file, $base_text ) = @_;

    # Parse the CSV file into a list of apparatus entries.
    my @app_list = read_csv( $csv_file );
    # Now put the base text onto the graph, and merge in the 
    # apparatus entries.
    merge_base( $graph, $base_text, @app_list );
}

# Takes a CSV file; returns a data structure of apparatus entries to
# be merged with a base text.

sub read_csv {
    my( $csv_file ) = @_;
    my $parser = Text::CSV::Simple->new();
    my @fields = qw/ reference text variant type context non_corr non_indep 
                     length total origin /;
    my @lines = $parser->read_file( $ARGV[0] );
    my @labels = @{shift( @lines )};
    push( @fields, @labels[10..$#labels] );

    my $started = 0;
    my $rdg_ctr = 0;
    my $apparatus = {};
    my @app_list;
    foreach my $line ( @lines ) {
	my $new_lemma = 0;
	if( $line->[0] =~ /^\d/ ) {
	    $new_lemma = $started = 1;
	}
	next unless $started;
	
	# Get the lines into their fields.
	my %linehash;
	@linehash{@fields} = @$line;
	
	# Readings can take up multiple lines in the CSV, so append the
	# apparatus to the list, and clear it out, if we have started a
	# new reading.
	if( $new_lemma ) {
	    push( @app_list, $apparatus ) if keys %$apparatus;
	    $apparatus = { _id => $linehash{reference},
	    };
	    $rdg_ctr = 0;
	}
	# The apparatus has multiple readings, and multiple witnesses per
	# reading.  So it's a hashref whose values are listrefs.
	$apparatus->{ 'rdg_0' } = $linehash{ 'text' } 
        if $linehash{ 'text' };
	$apparatus->{ 'rdg_' . ++$rdg_ctr } = $linehash{ 'variant' };
	foreach my $attr ( @fields[3..8] ) {
	    $apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{ $attr }
	    if $linehash{ $attr };
	}
	
	foreach my $k ( @fields[10..$#fields] ) {
	    my $variant_rdg = $linehash{$k};
	    $k =~ s/\s+\(a\.c\.\)//;
	    if( $variant_rdg =~ /^0/ ) {
		$apparatus->{ $k } = 'rdg_0'
		    unless exists $apparatus->{ $k };
	    } elsif ( $variant_rdg =~ /^1/ ) {
		$apparatus->{ $k } = 'rdg_' . $rdg_ctr;
	    } else { # else for $, we don't list the MS
		warn "Unparsed variant indicator $variant_rdg for $k in " .
		    $apparatus->{'_id'}
	        unless ( !$variant_rdg or $variant_rdg =~ /^\$$/ );
	    }
	}
	# See if we have at least one reading for each variant.
	my @seen_rdgs = values %$apparatus;
	foreach my $rdg ( grep { $_ =~ /^rdg/ } keys %$apparatus ) {
	    unless( grep { $_ =~ /^$rdg$/ } @seen_rdgs ) {
		print STDERR 'No manuscript found with reading "' 
		    . $apparatus->{$rdg} .
		    '" at location ' . $apparatus->{_id} . "\n";
		# delete $apparatus->{$rdg}; # for now
	    }
	}
    }
    # Done with loop, so push the last apparatus.
    push( @app_list, $apparatus );
    return @app_list;
}

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Parser::CSV;
b49c4318	2
	3	use strict;
	4	use warnings;
	5	use Text::CSV::Simple;
e58153d6	6	use Text::Tradition::Parser::BaseText qw( merge_base );
b49c4318	7
	8	# Takes a CSV file and a base text; returns a GraphML object.
	9
	10	sub parse {
	11	my( $graph, $csv_file, $base_text ) = @_;
	12
	13	# Parse the CSV file into a list of apparatus entries.
	14	my @app_list = read_csv( $csv_file );
	15	# Now put the base text onto the graph, and merge in the
	16	# apparatus entries.
	17	merge_base( $graph, $base_text, @app_list );
	18	}
	19
	20	# Takes a CSV file; returns a data structure of apparatus entries to
	21	# be merged with a base text.
	22
	23	sub read_csv {
	24	my( $csv_file ) = @_;
	25	my $parser = Text::CSV::Simple->new();
	26	my @fields = qw/ reference text variant type context non_corr non_indep
	27	length total origin /;
	28	my @lines = $parser->read_file( $ARGV[0] );
	29	my @labels = @{shift( @lines )};
	30	push( @fields, @labels[10..$#labels] );
	31
	32	my $started = 0;
	33	my $rdg_ctr = 0;
	34	my $apparatus = {};
	35	my @app_list;
	36	foreach my $line ( @lines ) {
	37	my $new_lemma = 0;
	38	if( $line->[0] =~ /^\d/ ) {
	39	$new_lemma = $started = 1;
	40	}
	41	next unless $started;
	42
	43	# Get the lines into their fields.
	44	my %linehash;
	45	@linehash{@fields} = @$line;
	46
	47	# Readings can take up multiple lines in the CSV, so append the
	48	# apparatus to the list, and clear it out, if we have started a
	49	# new reading.
	50	if( $new_lemma ) {
	51	push( @app_list, $apparatus ) if keys %$apparatus;
	52	$apparatus = { _id => $linehash{reference},
	53	};
	54	$rdg_ctr = 0;
	55	}
	56	# The apparatus has multiple readings, and multiple witnesses per
	57	# reading. So it's a hashref whose values are listrefs.
	58	$apparatus->{ 'rdg_0' } = $linehash{ 'text' }
	59	if $linehash{ 'text' };
	60	$apparatus->{ 'rdg_' . ++$rdg_ctr } = $linehash{ 'variant' };
	61	foreach my $attr ( @fields[3..8] ) {
	62	$apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{ $attr }
	63	if $linehash{ $attr };
	64	}
	65
	66	foreach my $k ( @fields[10..$#fields] ) {
	67	my $variant_rdg = $linehash{$k};
	68	$k =~ s/\s+\(a\.c\.\)//;
	69	if( $variant_rdg =~ /^0/ ) {
	70	$apparatus->{ $k } = 'rdg_0'
71	unless exists $apparatus->{ $k };
72	} elsif ( $variant_rdg =~ /^1/ ) {
73	$apparatus->{ $k } = 'rdg_' . $rdg_ctr;
74	} else { # else for $, we don't list the MS
75	warn "Unparsed variant indicator $variant_rdg for $k in " .
76	$apparatus->{'_id'}
77	unless ( !$variant_rdg or $variant_rdg =~ /^\$$/ );
78	}
79	}
80	# See if we have at least one reading for each variant.
81	my @seen_rdgs = values %$apparatus;
82	foreach my $rdg ( grep { $_ =~ /^rdg/ } keys %$apparatus ) {
83	unless( grep { $_ =~ /^$rdg$/ } @seen_rdgs ) {
84	print STDERR 'No manuscript found with reading "'
85	. $apparatus->{$rdg} .
86	'" at location ' . $apparatus->{_id} . "\n";
87	# delete $apparatus->{$rdg}; # for now
88	}
89	}
90	}
91	# Done with loop, so push the last apparatus.
92	push( @app_list, $apparatus );
93	return @app_list;
94	}
95
96	1;
97