[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / Tabular.pm

package Text::Tradition::Parser::Tabular;

use strict;
use warnings;
use Encode qw/ decode_utf8 /;
use Text::CSV;
use Text::Tradition::Error;
use TryCatch;

=head1 NAME

Text::Tradition::Parser::Tabular

=head1 SYNOPSIS

  use Text::Tradition;
  
  my $t_from_file = Text::Tradition->new( 
    'name' => 'my text',
    'input' => 'Tabular',
    'file' => '/path/to/collation.csv',
    'sep_char' => ','
    );
    
  my $t_from_string = Text::Tradition->new( 
    'name' => 'my text',
    'input' => 'Tabular',
    'string' => $tab_separated_collation,
    'sep_char' => "\t",
    );

=head1 DESCRIPTION

Parser module for Text::Tradition to read an alignment table format, such as 
CSV or Excel.

=head1 METHODS

=head2 B<parse>( $tradition, $option_hash )

Takes an initialized tradition and a set of options; creates the
appropriate nodes and edges on the graph, as well as the appropriate
witness objects.  The $option_hash can contain the following:

=over

=item * file - Name of file which contains the data

=item * string - A string that itself contains the data. One of 'file' or 
'string' is required.

=item * sep_char - For plaintext formats, the field separation character.
Defaults to "\t" (tab-separated); should be "," for comma-separated format.

=item * excel - If the data is in an Excel file, this option should be set
to 'xls' (for pre-2007 Excel format) or 'xlsx' (for Excel XML format.)

=back

The data should comprise a table with witnesses arranged in columns, with
the witness sigla in the first row.  Empty cells are interpreted as
omissions (and thus stemmatologically relevant.) Longer lacunae in the
text, to be disregarded in cladistic analysis, may be represented by
filling the appropriate cells with the tag '#LACUNA#'.

If a witness name ends in the collation's ac_label, it will be treated as
an 'ante-correction' version of the 'main' witness whose sigil it shares.

=begin testing

use Text::Tradition;
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
eval { no warnings; binmode $DB::OUT, ":utf8"; };

my $csv = 't/data/florilegium.csv';
my $t = Text::Tradition->new( 
    'name'  => 'inline', 
    'input' => 'Tabular',
    'file'  => $csv,
    'sep_char' => ',',
    );

is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );

### TODO Check these figures
if( $t ) {
    is( scalar $t->collation->readings, 311, "Collation has all readings" );
    is( scalar $t->collation->paths, 361, "Collation has all paths" );
    is( scalar $t->witnesses, 13, "Collation has all witnesses" );
}

# Check that we have the right witnesses
my %seen_wits;
map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
foreach my $wit ( $t->witnesses ) {
	$seen_wits{$wit->sigil} = 1;
}
is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
foreach my $k ( keys %seen_wits ) {
	ok( $seen_wits{$k}, "Witness $k still exists" );
}

# Check that the witnesses have the right texts
foreach my $wit ( $t->witnesses ) {
	my $origtext = join( ' ', @{$wit->text} );
	my $graphtext = $t->collation->path_text( $wit->sigil );
	is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
}

# Check that the a.c. witnesses have the right text
map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
foreach my $k ( keys %seen_wits ) {
	my $wit = $t->witness( $k );
	if( $seen_wits{$k} ) {
		ok( $wit->is_layered, "Witness $k got marked as layered" );
		ok( $wit->has_layertext, "Witness $k has an a.c. version" );
		my $origtext = join( ' ', @{$wit->layertext} );
		my $acsig = $wit->sigil . $t->collation->ac_label;
		my $graphtext = $t->collation->path_text( $acsig );
		is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
	} else {
		ok( !$wit->is_layered, "Witness $k not marked as layered" );
		ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
	}
}	

# Check that we only have collation relationships where we need them
is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );

## Check excel parsing

my $xls = 't/data/armexample.xls';
my $xt = Text::Tradition->new(
	'name'  => 'excel test',
	'input' => 'Tabular',
	'file'  => $xls,
	'excel'   => 'xls'
	);

is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
my %xls_wits;
map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
foreach my $wit ( $xt->witnesses ) {
	$xls_wits{$wit->sigil} = 1;
}
is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
foreach my $k ( keys %xls_wits ) {
	ok( $xls_wits{$k}, "Witness $k still exists" );
}
is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
is( $xt->collation->reading('r5.1')->text, "\x{587}", 
	"Correct decoding of at least one reading" );

my $xlsx = 't/data/armexample.xlsx';
my $xtx = Text::Tradition->new(
	'name'  => 'excel test',
	'input' => 'Tabular',
	'file'  => $xlsx,
	'excel'   => 'xlsx'
	);

is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
my %xlsx_wits;
map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
foreach my $wit ( $xtx->witnesses ) {
	$xlsx_wits{$wit->sigil} = 1;
}
is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
foreach my $k ( keys %xlsx_wits ) {
	ok( $xlsx_wits{$k}, "Witness $k still exists" );
}
is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
is( $xtx->collation->reading('r5.1')->text, "\x{587}", 
	"Correct decoding of at least one reading" );

=end testing

=cut

# Populate $tradition from the alignment table described by $opts.
#
#   $tradition - the initialized tradition object; witnesses are added to it
#                and readings/paths are added to its collation.
#   $opts      - hashref of input options, handed unchanged to
#                _table_from_input (file / string / sep_char / excel).
#
# Discrepancies found by the final consistency check are reported with warn()
# rather than thrown.
sub parse {
    my( $tradition, $opts ) = @_;
    my $alignment_table = _table_from_input( $opts );
    # Set up the witnesses we find in the first line
    my @witnesses;
    my %ac_wits;  # Track layered witness -> main witness mapping
    my $c = $tradition->collation; # shorthand
    my $aclabel = $c->ac_label;
    foreach my $sigil ( @{$alignment_table->[0]} ) {
        if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
            # Sanitize the sigil name to an XML name. $sigil aliases the
            # header cell, so the second pass over the header row below
            # sees the rewritten name as well.
            $sigil = $1 . '_layered';
            $ac_wits{$sigil} = $1;
        }
        my $wit = $tradition->add_witness(
            'sigil' => $sigil, 'sourcetype' => 'collation' );
        # Every witness path begins at the collation's start node.
        $wit->path( [ $c->start ] );
        push( @witnesses, $wit );
    }

    # Save the original witness text sequences. Have to loop back through
    # the witness columns after we have identified all the a.c. witnesses.
    foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
        my @sequence = map { $_->[$idx] } @{$alignment_table};
        my $sigil = shift @sequence;
        my $is_layer = exists( $ac_wits{$sigil} );
        # The text of an a.c. column is stored on its main witness.
        my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
        # Now get rid of gaps and meta-readings like #LACUNA#
        my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
        $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
    }

    # Per-row 'collated' relationships are skipped for very large tables
    # (witness count times row count above 150000).
    my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
    print STDERR "Tradition too big for row collation\n" if $nocollate;

    # Now for the next rows, make nodes as necessary, assign their ranks, and
    # add them to the witness paths.
    foreach my $idx ( 1 .. $#{$alignment_table} ) {
        my $row = $alignment_table->[$idx];
        my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
        foreach my $w ( 0 .. $#{$row} ) {
            # push the appropriate node onto the appropriate witness path
            my $word = $row->[$w];
            if( $word ) {
                my $reading = $nodes->{$word};
                my $wit = $witnesses[$w];
                push( @{$wit->path}, $reading );
            } # else skip it for empty readings.
        }
    }

    # Collapse our lacunae into a single node and
    # push the end node onto all paths.
    $c->end->rank( scalar @$alignment_table );
    foreach my $wit ( @witnesses ) {
        my $p = $wit->path;
        my $last_rdg = shift @$p;
        my $new_p = [ $last_rdg ];
        foreach my $rdg ( @$p ) {
            # Omit the reading if we are in a lacuna already.
            next if $rdg->is_lacuna && $last_rdg->is_lacuna;
            # Save the reading otherwise.
            push( @$new_p, $rdg );
            $last_rdg = $rdg;
        }
        push( @$new_p, $c->end );
        $wit->path( $new_p );
    }

    # Fold any a.c. witnesses into their main witness objects, and
    # delete the independent a.c. versions.
    foreach my $a ( keys %ac_wits ) {
        my $ac_wit = $tradition->witness( $a );
        my $main_wit = $tradition->witness( $ac_wits{$a} );
        # With no main witness to fold into, the '_layered' witness is
        # left in place as a witness of its own.
        next unless $main_wit;
        $main_wit->is_layered(1);
        $main_wit->uncorrected_path( $ac_wit->path );
        $tradition->del_witness( $ac_wit );
    }

    # Join up the paths.
    $c->make_witness_paths;
    # Delete our unused lacuna nodes.
    foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
        $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
    }

    # Do a consistency check: the text read back along each witness path
    # must equal the word sequence saved from the table.
    foreach my $wit ( $tradition->witnesses ) {
        my $pathtext = $c->path_text( $wit->sigil );
        my $origtext = join( ' ', @{$wit->text} );
        warn "Text differs for witness " . $wit->sigil
            unless $pathtext eq $origtext;
        if( $wit->is_layered ) {
            $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
            $origtext = join( ' ', @{$wit->layertext} );
            warn "Ante-corr text differs for witness " . $wit->sigil
                unless $pathtext eq $origtext;
        } else {
            warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
                if $wit->has_layertext;
        }
    }

    # Note that our ranks and common readings are set.
    $c->_graphcalc_done(1);
    # Remove redundant collation relationships.
    $c->relations->filter_collations() unless $nocollate;
}

# Read the alignment table named by $opts and return it as an arrayref of
# rows, each row an arrayref of cell values; row 0 holds the witness sigla.
#
# Recognized keys in $opts:
#   excel    - 'xls' or 'xlsx'; selects spreadsheet parsing, which also
#              requires 'file'.
#   file     - path of the input file.
#   string   - the delimited data itself (takes precedence over 'file' for
#              delimited input).
#   sep_char - field separator for delimited input; defaults to "\t".
#
# Throws (via throw()) when a required option or parser module is missing,
# when the input cannot be opened or parsed, or when 'excel' holds an
# unknown variant.
sub _table_from_input {
	my $opts = shift;
	my $alignment_table = [];
	if( $opts->{'excel'} ) {
		my $sheet;
		my $need_decode;
		unless( exists $opts->{'file'} ) {
			throw( "Must pass the filename for Excel parsing" );
		}
		if( $opts->{'excel'} eq 'xls' ) {
			try {
				require Spreadsheet::ParseExcel;
			} catch {
				throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
			}
			my $parser = Spreadsheet::ParseExcel->new();
			my $workbook = $parser->parse( $opts->{'file'} );
			unless( defined $workbook && defined $workbook->worksheet(0) ) {
				throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
			}
			$sheet = $workbook->worksheet(0);
		} elsif( $opts->{'excel'} eq 'xlsx' ) {
			try {
				require Spreadsheet::XLSX;
			} catch {
				throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
			}
			# Cell values from this parser are run through decode_utf8
			# when the worksheet is read.
			$need_decode = 1;
			my $workbook;
			try {
				$workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
			} catch {
				throw( "Failed to parse file " . $opts->{'file'} );
			}
			$sheet = $workbook->worksheet(0);
			# Fail with a parser error rather than a method call on undef.
			unless( defined $sheet ) {
				throw( "Failed to parse file " . $opts->{'file'} );
			}
		} else {
			throw( "Unrecognized Excel variant " . $opts->{'excel'} );
		}
		$alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
	} else {
		# Assume it is a comma-, tab-, or whatever-separated format.
		my $csv_options = { 'binary' => 1 };
		$csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
		if( $csv_options->{'sep_char'} eq "\t" ) {
			# If it is really tab separated, nothing is an escape char.
			$csv_options->{'quote_char'} = undef;
			$csv_options->{'escape_char'} = undef;
		}
		my $csv = Text::CSV->new( $csv_options );

		if( exists $opts->{'string' } ) {
			my @lines = split( "\n", $opts->{'string'} );
			foreach my $l ( @lines ) {
				my $status = $csv->parse( $l );
				if( $status ) {
					push( @$alignment_table, [ $csv->fields ] );
				} else {
					throw( "Could not parse line $l: " . $csv->error_input );
				}
			}
		} elsif( exists $opts->{'file'} ) {
			# Three-argument open: the filename can never be read as a
			# mode or a pipe. A failure is fatal, like every other input
			# error in this sub, instead of yielding an empty table.
			open( my $fh, '<', $opts->{'file'} )
				or throw( "Could not open input file " . $opts->{'file'} . ": $!" );
			binmode( $fh, ':utf8' );
			while( my $row = $csv->getline( $fh ) ) {
				push( @$alignment_table, $row );
			}
			close $fh;
		} else {
			throw( "Could not find string or file option to parse" );
		}
	}
	return $alignment_table;
}
# Convert a parsed worksheet into an alignment table.
#
#   $sheet  - worksheet object; must provide row_range(), col_range() and
#             get_cell( $row, $col ), whose cells provide value().
#   $decode - true if cell values need to be passed through decode_utf8.
#
# Returns an arrayref of rows. Row 0 holds the sigla taken from the first
# used row of the sheet; columns without a sigil there are dropped from
# every row. Cells that do not exist come back as undef.
# Throws if the reported maximum row or column index is false.
sub _alignment_from_worksheet {
	my( $sheet, $decode ) = @_;
	my $alignment_table = [];

	my( $rmin, $rmax ) = $sheet->row_range();
	my( $cmin, $cmax ) = $sheet->col_range();
	# NOTE(review): this also rejects a sheet whose last used row or
	# column has index 0 -- presumably deliberate, confirm before relaxing.
	unless( $cmax && $rmax ) {
		throw( "Found no rows or no columns in first worksheet" );
	}
	# Populate the alignment table. We only want columns that have
	# a sigil in row zero.
	my %sigcols = ();
	push( @$alignment_table, [] );
	foreach my $col ( $cmin .. $cmax ) {
		my $cell = $sheet->get_cell( $rmin, $col );
		my $cellval = $cell ? $cell->value() : undef;
		# Sigla get the same decoding as the body cells below, so that a
		# non-ASCII sigil is not left as raw bytes.
		$cellval = decode_utf8( $cellval ) if $decode && defined $cellval;
		if( $cellval ) {
			$sigcols{$col} = 1;
			push( @{$alignment_table->[0]}, $cellval );
		}
	}
	# Now go through the rest of the rows and pick up the columns
	# that were headed by a sigil.
	foreach my $row ( $rmin+1 .. $rmax ) {
		my @tablerow;
		foreach my $col ( $cmin .. $cmax ) {
			next unless $sigcols{$col};
			my $cell = $sheet->get_cell( $row, $col );
			my $cellval;
			if( $cell ) {
				$cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
			}
			# Keep the placeholder even when undef so columns stay aligned.
			push( @tablerow, $cellval );
		}
		push( @$alignment_table, \@tablerow );
	}
	return $alignment_table;
}

# Create the readings for one row (rank) of the alignment table.
#
#   $collation - collation object that receives the readings and, unless
#                $nocollate is set, the 'collated' relationships.
#   $row       - arrayref of cell values for this rank, one per witness column.
#   $index     - the rank; also used in the reading ids ("r$index.N").
#   $nocollate - true to skip creating relationships between the readings.
#
# Returns a hashref mapping each distinct non-empty cell value to the
# reading created for it. Readings are numbered in order of first
# appearance in the row, so ids do not depend on hash ordering.
sub _make_nodes {
    my( $collation, $row, $index, $nocollate ) = @_;
    my %unique;      # cell value => 1, later replaced by its reading
    my @ordered;     # distinct cell values in order of first appearance
    my $has_gap = 0; # true if any witness has an empty cell at this rank
    foreach my $w ( @$row ) {
        if( $w ) {
            push( @ordered, $w ) unless exists $unique{$w};
            $unique{$w} = 1;
        } else {
            $has_gap = 1;
        }
    }
    # Holds the number of unique readings + gaps, ex. lacunae. A reading is
    # common only if it is the sole distinct reading at this rank and no
    # witness has a gap; lacunose witnesses do not count against it.
    my $commonctr = scalar( grep { $_ ne '#LACUNA#' } @ordered ) + $has_gap;
    my $ctr = 1;
    foreach my $w ( @ordered ) {
        my $rargs = {
            'id' => "r$index.$ctr",
            'rank' => $index,
            'text' => $w,
            };
        if( $w eq '#LACUNA#' ) {
            $rargs->{'is_lacuna'} = 1;
        } elsif( $commonctr == 1 ) {
            $rargs->{'is_common'} = 1;
        }
        my $r = $collation->add_reading( $rargs );
        $unique{$w} = $r;
        $ctr++;
    }
    # Collate this sequence of readings via a single 'collation' relationship.
    unless( $nocollate ) {
        my @rankrdgs = map { $unique{$_} } @ordered;
        my $collation_rel;
        while( @rankrdgs ) {
            my $r = shift @rankrdgs;
            next if $r->is_meta;
            foreach my $nr ( @rankrdgs ) {
                next if $nr->is_meta;
                if( $collation_rel ) {
                    # Reuse the relationship made for the first pair.
                    $collation->add_relationship( $r, $nr, $collation_rel );
                } else {
                    $collation->add_relationship( $r, $nr,
                        { 'type' => 'collated',
                          'annotation' => "Parsed together for rank $index" } );
                    $collation_rel = $collation->get_relationship( $r, $nr );
                }
            }
        }
    }
    return \%unique;
}

# Raise a Text::Tradition::Error for this parser. The first argument is
# used as the error message; the ident is always 'Parser::Tabular error'.
sub throw {
	my @error_args = (
		'ident'   => 'Parser::Tabular error',
		'message' => $_[0],
	);
	Text::Tradition::Error->throw( @error_args );
}

1;

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews E<lt>aurum@cpan.orgE<gt>
Commit	Line	Data
d9e873d0	1	package Text::Tradition::Parser::Tabular;
	2
	3	use strict;
	4	use warnings;
3a3b8213	5	use Encode qw/ decode_utf8 /;
82fa4d57	6	use Text::CSV;
701ad2ba	7	use Text::Tradition::Error;
701ad2ba	8	use TryCatch;
d9e873d0	9
	10	=head1 NAME
	11
	12	Text::Tradition::Parser::Tabular
	13
3b853983	14	=head1 SYNOPSIS
	15
	16	use Text::Tradition;
	17
	18	my $t_from_file = Text::Tradition->new(
	19	'name' => 'my text',
	20	'input' => 'Tabular',
	21	'file' => '/path/to/collation.csv',
	22	'sep_char' => ','
	23	);
	24
	25	my $t_from_string = Text::Tradition->new(
	26	'name' => 'my text',
	27	'input' => 'Tabular',
	28	'string' => $tab_separated_collation,
	29	'sep_char' => "\t",
	30	);
	31
d9e873d0	32	=head1 DESCRIPTION
d9e873d0	33
a445ce40	34	Parser module for Text::Tradition to read an alignment table format, such as
a445ce40	35	CSV or Excel.
d9e873d0	36
	37	=head1 METHODS
	38
e867486f	39	=head2 B<parse>( $tradition, $option_hash )
3b853983	40
	41	Takes an initialized tradition and a set of options; creates the
	42	appropriate nodes and edges on the graph, as well as the appropriate
a445ce40	43	witness objects. The $option_hash can contain the following:
	44
	45	=over
	46
	47	=item * file - Name of file which contains the data
	48
	49	=item * string - A string that itself contains the data. One of 'file' or
	50	'string' is required.
	51
	52	=item * sep_char - For plaintext formats, the field separation character.
	53	Defaults to "\t" (tab-separated); should be "," for comma-separated format.
	54
	55	=item * excel - If the data is in an Excel file, this option should be set
	56	to 'xls' (for pre-2007 Excel format) or 'xlsx' (for Excel XML format.)
	57
	58	=back
	59
	60	The data should comprise a table with witnesses arranged in columns, with
	61	the witness sigla in the first row. Empty cells are interpreted as
	62	omissions (and thus stemmatologically relevant.) Longer lacunae in the
	63	text, to be disregarded in cladistic analysis, may be represented by
	64	filling the appropriate cells with the tag '#LACUNA#'.
3b853983	65
	66	If a witness name ends in the collation's ac_label, it will be treated as
	67	an 'ante-correction' version of the 'main' witness whose sigil it shares.
	68
	69	=begin testing
	70
	71	use Text::Tradition;
	72	binmode STDOUT, ":utf8";
	73	binmode STDERR, ":utf8";
	74	eval { no warnings; binmode $DB::OUT, ":utf8"; };
	75
	76	my $csv = 't/data/florilegium.csv';
	77	my $t = Text::Tradition->new(
	78	'name' => 'inline',
	79	'input' => 'Tabular',
	80	'file' => $csv,
	81	'sep_char' => ',',
	82	);
d9e873d0	83
3b853983	84	is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0	85
3b853983	86	### TODO Check these figures
3b853983	87	if( $t ) {
0e47f4f6	88	is( scalar $t->collation->readings, 311, "Collation has all readings" );
0e47f4f6	89	is( scalar $t->collation->paths, 361, "Collation has all paths" );
3b853983	90	is( scalar $t->witnesses, 13, "Collation has all witnesses" );
	91	}
	92
b0b4421a	93	# Check that we have the right witnesses
	94	my %seen_wits;
	95	map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
	96	foreach my $wit ( $t->witnesses ) {
	97	$seen_wits{$wit->sigil} = 1;
	98	}
	99	is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
	100	foreach my $k ( keys %seen_wits ) {
	101	ok( $seen_wits{$k}, "Witness $k still exists" );
	102	}
	103
	104	# Check that the witnesses have the right texts
	105	foreach my $wit ( $t->witnesses ) {
	106	my $origtext = join( ' ', @{$wit->text} );
	107	my $graphtext = $t->collation->path_text( $wit->sigil );
	108	is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
	109	}
	110
	111	# Check that the a.c. witnesses have the right text
	112	map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
	113	foreach my $k ( keys %seen_wits ) {
	114	my $wit = $t->witness( $k );
	115	if( $seen_wits{$k} ) {
	116	ok( $wit->is_layered, "Witness $k got marked as layered" );
	117	ok( $wit->has_layertext, "Witness $k has an a.c. version" );
	118	my $origtext = join( ' ', @{$wit->layertext} );
	119	my $acsig = $wit->sigil . $t->collation->ac_label;
861c3e27	120	my $graphtext = $t->collation->path_text( $acsig );
b0b4421a	121	is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
	122	} else {
	123	ok( !$wit->is_layered, "Witness $k not marked as layered" );
	124	ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
	125	}
	126	}
	127
cc31ebaa	128	# Check that we only have collation relationships where we need them
cc31ebaa	129	is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
cc31ebaa	130
701ad2ba	131	## Check excel parsing
	132
	133	my $xls = 't/data/armexample.xls';
	134	my $xt = Text::Tradition->new(
	135	'name' => 'excel test',
	136	'input' => 'Tabular',
	137	'file' => $xls,
3a3b8213	138	'excel' => 'xls'
701ad2ba	139	);
	140
	141	is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
	142	my %xls_wits;
	143	map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
	144	foreach my $wit ( $xt->witnesses ) {
	145	$xls_wits{$wit->sigil} = 1;
	146	}
	147	is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
	148	foreach my $k ( keys %xls_wits ) {
	149	ok( $xls_wits{$k}, "Witness $k still exists" );
	150	}
	151	is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
	152	is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
	153	is( $xt->collation->reading('r5.1')->text, "\x{587}",
	154	"Correct decoding of at least one reading" );
	155
3a3b8213	156	my $xlsx = 't/data/armexample.xlsx';
	157	my $xtx = Text::Tradition->new(
	158	'name' => 'excel test',
	159	'input' => 'Tabular',
	160	'file' => $xlsx,
	161	'excel' => 'xlsx'
	162	);
	163
	164	is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
	165	my %xlsx_wits;
	166	map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
	167	foreach my $wit ( $xtx->witnesses ) {
	168	$xlsx_wits{$wit->sigil} = 1;
	169	}
	170	is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
	171	foreach my $k ( keys %xlsx_wits ) {
	172	ok( $xlsx_wits{$k}, "Witness $k still exists" );
	173	}
	174	is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
	175	is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
	176	is( $xtx->collation->reading('r5.1')->text, "\x{587}",
	177	"Correct decoding of at least one reading" );
	178
3b853983	179	=end testing
d9e873d0	180
	181	=cut
	182
	183	sub parse {
dfc37e38	184	my( $tradition, $opts ) = @_;
3a3b8213	185	my $alignment_table = _table_from_input( $opts );
d9e873d0	186	# Set up the witnesses we find in the first line
d9e873d0	187	my @witnesses;
b0b4421a	188	my %ac_wits; # Track layered witness -> main witness mapping
3a3b8213	189	my $c = $tradition->collation; # shorthand
82fa4d57	190	my $aclabel = $c->ac_label;
d9e873d0	191	foreach my $sigil ( @{$alignment_table->[0]} ) {
3b853983	192	if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
82fa4d57	193	# Sanitize the sigil name to an XML name
82fa4d57	194	$sigil = $1 . '_layered';
b0b4421a	195	$ac_wits{$sigil} = $1;
3b853983	196	}
82fa4d57	197	my $wit = $tradition->add_witness(
	198	'sigil' => $sigil, 'sourcetype' => 'collation' );
	199	$wit->path( [ $c->start ] );
	200	push( @witnesses, $wit );
	201	my $aclabel = $c->ac_label;
d9e873d0	202	}
d9e873d0	203
b0b4421a	204	# Save the original witness text sequences. Have to loop back through
	205	# the witness columns after we have identified all the a.c. witnesses.
	206	foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
	207	my @sequence = map { $_->[$idx] } @{$alignment_table};
	208	my $sigil = shift @sequence;
	209	my $is_layer = exists( $ac_wits{$sigil} );
	210	my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
	211	# Now get rid of gaps and meta-readings like #LACUNA#
	212	my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
	213	$is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
	214	}
	215
9bdf9d67	216	my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
	217	print STDERR "Tradition too big for row collation\n" if $nocollate;
	218
d9e873d0	219	# Now for the next rows, make nodes as necessary, assign their ranks, and
d9e873d0	220	# add them to the witness paths.
d9e873d0	221	foreach my $idx ( 1 .. $#{$alignment_table} ) {
d9e873d0	222	my $row = $alignment_table->[$idx];
9bdf9d67	223	my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
d9e873d0	224	foreach my $w ( 0 .. $#{$row} ) {
	225	# push the appropriate node onto the appropriate witness path
	226	my $word = $row->[$w];
	227	if( $word ) {
	228	my $reading = $nodes->{$word};
	229	my $wit = $witnesses[$w];
	230	push( @{$wit->path}, $reading );
	231	} # else skip it for empty readings.
	232	}
	233	}
	234
eca16057	235	# Collapse our lacunae into a single node and
eca16057	236	# push the end node onto all paths.
d9e873d0	237	$c->end->rank( scalar @$alignment_table );
d9e873d0	238	foreach my $wit ( @witnesses ) {
eca16057	239	my $p = $wit->path;
	240	my $last_rdg = shift @$p;
	241	my $new_p = [ $last_rdg ];
	242	foreach my $rdg ( @$p ) {
83d5ac3a	243	# Omit the reading if we are in a lacuna already.
	244	next if $rdg->is_lacuna && $last_rdg->is_lacuna;
	245	# Save the reading otherwise.
	246	push( @$new_p, $rdg );
	247	$last_rdg = $rdg;
eca16057	248	}
	249	push( @$new_p, $c->end );
	250	$wit->path( $new_p );
d9e873d0	251	}
d9e873d0	252
3b853983	253	# Fold any a.c. witnesses into their main witness objects, and
	254	# delete the independent a.c. versions.
	255	foreach my $a ( keys %ac_wits ) {
b0b4421a	256	my $ac_wit = $tradition->witness( $a );
b0b4421a	257	my $main_wit = $tradition->witness( $ac_wits{$a} );
3b853983	258	next unless $main_wit;
861c3e27	259	$main_wit->is_layered(1);
3b853983	260	$main_wit->uncorrected_path( $ac_wit->path );
	261	$tradition->del_witness( $ac_wit );
	262	}
83d5ac3a	263
d9e873d0	264	# Join up the paths.
d9e873d0	265	$c->make_witness_paths;
83d5ac3a	266	# Delete our unused lacuna nodes.
	267	foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
	268	$c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
	269	}
861c3e27	270
	271	# Do a consistency check.
	272	foreach my $wit ( $tradition->witnesses ) {
	273	my $pathtext = $c->path_text( $wit->sigil );
	274	my $origtext = join( ' ', @{$wit->text} );
	275	warn "Text differs for witness " . $wit->sigil
	276	unless $pathtext eq $origtext;
	277	if( $wit->is_layered ) {
	278	$pathtext = $c->path_text( $wit->sigil.$c->ac_label );
	279	$origtext = join( ' ', @{$wit->layertext} );
	280	warn "Ante-corr text differs for witness " . $wit->sigil
	281	unless $pathtext eq $origtext;
	282	} else {
	283	warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
	284	if $wit->has_layertext;
	285	}
	286	}
202ccb18	287
	288	# Note that our ranks and common readings are set.
	289	$c->_graphcalc_done(1);
cc31ebaa	290	# Remove redundant collation relationships.
9bdf9d67	291	$c->relations->filter_collations() unless $nocollate;
d9e873d0	292	}
d9e873d0	293
3a3b8213	294	sub _table_from_input {
	295	my $opts = shift;
	296	my $alignment_table = [];
	297	if( $opts->{'excel'} ) {
	298	my $sheet;
	299	my $need_decode;
	300	unless( exists $opts->{'file'} ) {
	301	throw( "Must pass the filename for Excel parsing" );
	302	}
	303	if( $opts->{'excel'} eq 'xls' ) {
	304	try {
	305	require Spreadsheet::ParseExcel;
	306	} catch {
	307	throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
	308	}
	309	my $parser = Spreadsheet::ParseExcel->new();
	310	my $workbook = $parser->parse( $opts->{'file'} );
	311	unless( defined $workbook && defined $workbook->worksheet(0) ) {
	312	throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
	313	}
	314	$sheet = $workbook->worksheet(0);
	315	} elsif( $opts->{'excel'} eq 'xlsx' ) {
	316	try {
	317	require Spreadsheet::XLSX;
	318	} catch {
	319	throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
	320	}
	321	$need_decode = 1;
	322	my $workbook;
	323	try {
	324	$workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
	325	} catch {
	326	throw( "Failed to parse file " . $opts->{'file'} );
	327	}
	328	$sheet = $workbook->worksheet(0);
	329	} else {
	330	throw( "Unrecognized Excel variant" . $opts->{'excel'} );
	331	}
	332	$alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
	333	} else {
	334	# Assume it is a comma-, tab-, or whatever-separated format.
	335	my $csv_options = { 'binary' => 1 };
	336	$csv_options->{'sep_char'} = $opts->{'sep_char'} \|\| "\t";
	337	if( $csv_options->{'sep_char'} eq "\t" ) {
	338	# If it is really tab separated, nothing is an escape char.
	339	$csv_options->{'quote_char'} = undef;
	340	$csv_options->{'escape_char'} = undef;
	341	}
	342	my $csv = Text::CSV->new( $csv_options );
	343
	344	if( exists $opts->{'string' } ) {
	345	my @lines = split( "\n", $opts->{'string'} );
	346	foreach my $l ( @lines ) {
	347	my $status = $csv->parse( $l );
	348	if( $status ) {
	349	push( @$alignment_table, [ $csv->fields ] );
	350	} else {
	351	throw( "Could not parse line $l: " . $csv->error_input );
	352	}
	353	}
	354	} elsif( exists $opts->{'file'} ) {
	355	open( my $fh, $opts->{'file'} )
	356	or warn "Could not open input file " . $opts->{'file'};
	357	binmode( $fh, ':utf8' );
358	while( my $row = $csv->getline( $fh ) ) {
359	push( @$alignment_table, $row );
360	}
361	close $fh;
362	} else {
363	throw( "Could not find string or file option to parse" );
364	}
365	}
366	return $alignment_table;
367	}
368	sub _alignment_from_worksheet {
369	my( $sheet, $decode ) = @_;
370	my $alignment_table = [];
371
372	my( $rmin, $rmax ) = $sheet->row_range();
373	my( $cmin, $cmax ) = $sheet->col_range();
374	unless( $cmax && $rmax ) {
375	throw( "Found no rows or no columns in first worksheet" );
376	}
377	# Populate the alignment table. We only want columns that have
378	# a sigil in row zero.
379	my %sigcols = ();
380	push( @$alignment_table, [] );
381	foreach my $col ( $cmin .. $cmax ) {
382	my $cell = $sheet->get_cell( $rmin, $col );
383	my $cellval = $cell ? $cell->value() : undef;
384	if( $cellval ) {
385	$sigcols{$col} = 1;
386	push( @{$alignment_table->[0]}, $cellval );
387	}
388	}
389	# Now go through the rest of the rows and pick up the columns
390	# that were headed by a sigil.
391	foreach my $row ( $rmin+1 .. $rmax ) {
392	my @tablerow;
393	foreach my $col ( $cmin .. $cmax ) {
394	next unless $sigcols{$col};
395	my $cell = $sheet->get_cell( $row, $col );
396	my $cellval;
397	if( $cell ) {
398	$cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
399	}
400	push( @tablerow, $cellval );
401	}
402	push( @$alignment_table, \@tablerow );
403	}
404	return $alignment_table;
405	}
406
027d819c	407	sub _make_nodes {
9bdf9d67	408	my( $collation, $row, $index, $nocollate ) = @_;
d9e873d0	409	my %unique;
15db7774	410	my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
d9e873d0	411	foreach my $w ( @$row ) {
d9e873d0	412	$unique{$w} = 1 if $w;
15db7774	413	$commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
d9e873d0	414	}
	415	my $ctr = 1;
	416	foreach my $w ( keys %unique ) {
a753cc84	417	my $rargs = {
10e4b1ac	418	'id' => "r$index.$ctr",
a753cc84	419	'rank' => $index,
	420	'text' => $w,
	421	};
15db7774	422	if( $w eq '#LACUNA#' ) {
	423	$rargs->{'is_lacuna'} = 1;
	424	} elsif( $commonctr == 1 ) {
	425	$rargs->{'is_common'} = 1;
	426	}
a753cc84	427	my $r = $collation->add_reading( $rargs );
d9e873d0	428	$unique{$w} = $r;
a753cc84	429	$ctr++;
d9e873d0	430	}
bf6e338d	431	# Collate this sequence of readings via a single 'collation' relationship.
9bdf9d67	432	unless( $nocollate ) {
	433	my @rankrdgs = values %unique;
	434	my $collation_rel;
	435	while( @rankrdgs ) {
	436	my $r = shift @rankrdgs;
	437	next if $r->is_meta;
	438	foreach my $nr ( @rankrdgs ) {
	439	next if $nr->is_meta;
	440	if( $collation_rel ) {
	441	$collation->add_relationship( $r, $nr, $collation_rel );
	442	} else {
	443	$collation->add_relationship( $r, $nr,
	444	{ 'type' => 'collated',
	445	'annotation' => "Parsed together for rank $index" } );
	446	$collation_rel = $collation->get_relationship( $r, $nr );
	447	}
	448	}
	449	}
	450	}
d9e873d0	451	return \%unique;
	452	}
	453
3a3b8213	454	sub throw {
	455	Text::Tradition::Error->throw(
	456	'ident' => 'Parser::Tabular error',
	457	'message' => $_[0],
	458	);
	459	}
	460
3b853983	461	1;
	462
	463	=head1 LICENSE
	464
	465	This package is free software and is provided "as is" without express
	466	or implied warranty. You can redistribute it and/or modify it under
	467	the same terms as Perl itself.
	468
	469	=head1 AUTHOR
	470
	471	Tara L Andrews E<lt>aurum@cpan.orgE<gt>