[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / Tabular.pm

package Text::Tradition::Parser::Tabular;

use strict;
use warnings;
use Encode qw/ decode_utf8 /;
use Text::CSV;
use Text::Tradition::Error;
use TryCatch;

=head1 NAME

Text::Tradition::Parser::Tabular

=head1 SYNOPSIS

  use Text::Tradition;
  
  my $t_from_file = Text::Tradition->new( 
    'name' => 'my text',
    'input' => 'Tabular',
    'file' => '/path/to/collation.csv',
    'sep_char' => ','
    );
    
  my $t_from_string = Text::Tradition->new( 
    'name' => 'my text',
    'input' => 'Tabular',
    'string' => $tab_separated_collation,
    'sep_char' => "\t",
    );

=head1 DESCRIPTION

Parser module for Text::Tradition to read an alignment table format, such as 
CSV or Excel.

=head1 METHODS

=head2 B<parse>( $tradition, $option_hash )

Takes an initialized tradition and a set of options; creates the
appropriate nodes and edges on the graph, as well as the appropriate
witness objects.  The $option_hash can contain the following:

=over

=item * file - Name of file which contains the data

=item * string - A string that itself contains the data. One of 'file' or 
'string' is required.

=item * sep_char - For plaintext formats, the field separation character.
Defaults to "\t" (tab-separated); should be "," for comma-separated format.

=item * excel - If the data is in an Excel file, this option should be set
to 'xls' (for pre-2007 Excel format) or 'xlsx' (for Excel XML format.)

=back

The data should comprise a table with witnesses arranged in columns, with
the witness sigla in the first row.  Empty cells are interpreted as
omissions (and thus stemmatologically relevant.) Longer lacunae in the
text, to be disregarded in cladistic analysis, may be represented by
filling the appropriate cells with the tag '#LACUNA#'.

If a witness name ends in the collation's ac_label, it will be treated as
an 'ante-correction' version of the 'main' witness whose sigil it shares.

=begin testing

use Test::More::UTF8;
use Text::Tradition;
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
eval { no warnings; binmode $DB::OUT, ":utf8"; };

my $csv = 't/data/florilegium.csv';
my $t = Text::Tradition->new( 
    'name'  => 'inline', 
    'input' => 'Tabular',
    'file'  => $csv,
    'sep_char' => ',',
    );

is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );

### TODO Check these figures
if( $t ) {
    is( scalar $t->collation->readings, 311, "Collation has all readings" );
    is( scalar $t->collation->paths, 361, "Collation has all paths" );
    is( scalar $t->witnesses, 13, "Collation has all witnesses" );
}

# Check that we have the right witnesses
my %seen_wits;
map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
foreach my $wit ( $t->witnesses ) {
	$seen_wits{$wit->sigil} = 1;
}
is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
foreach my $k ( keys %seen_wits ) {
	ok( $seen_wits{$k}, "Witness $k still exists" );
}

# Check that the witnesses have the right texts
foreach my $wit ( $t->witnesses ) {
	my $origtext = join( ' ', @{$wit->text} );
	my $graphtext = $t->collation->path_text( $wit->sigil );
	is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
}

# Check that the a.c. witnesses have the right text
map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
foreach my $k ( keys %seen_wits ) {
	my $wit = $t->witness( $k );
	if( $seen_wits{$k} ) {
		ok( $wit->is_layered, "Witness $k got marked as layered" );
		ok( $wit->has_layertext, "Witness $k has an a.c. version" );
		my $origtext = join( ' ', @{$wit->layertext} );
		my $acsig = $wit->sigil . $t->collation->ac_label;
		my $graphtext = $t->collation->path_text( $acsig );
		is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
	} else {
		ok( !$wit->is_layered, "Witness $k not marked as layered" );
		ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
	}
}	

# Check that we only have collation relationships where we need them
is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );

## Check excel parsing

my $xls = 't/data/armexample.xls';
my $xt = Text::Tradition->new(
	'name'  => 'excel test',
	'input' => 'Tabular',
	'file'  => $xls,
	'excel'   => 'xls'
	);

is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
my %xls_wits;
map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
foreach my $wit ( $xt->witnesses ) {
	$xls_wits{$wit->sigil} = 1;
}
is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
foreach my $k ( keys %xls_wits ) {
	ok( $xls_wits{$k}, "Witness $k still exists" );
}
is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
is( $xt->collation->reading('r5.1')->text, "\x{587}", 
	"Correct decoding of at least one reading" );

my $xlsx = 't/data/armexample.xlsx';
my $xtx = Text::Tradition->new(
	'name'  => 'excel test',
	'input' => 'Tabular',
	'file'  => $xlsx,
	'excel'   => 'xlsx'
	);

is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
my %xlsx_wits;
map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit3 /;
$xlsx_wits{"\x{531}\x{562}2"} = 0;
foreach my $wit ( $xtx->witnesses ) {
	$xlsx_wits{$wit->sigil} = 1;
}
is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
foreach my $k ( keys %xlsx_wits ) {
	ok( $xlsx_wits{$k}, "Witness $k still exists" );
}
is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
is( $xtx->collation->reading('r5.1')->text, "\x{587}", 
	"Correct decoding of at least one reading" );

=end testing

=cut

# parse( $tradition, $opts )
#
# Reads the alignment table described by $opts (see _table_from_input) and
# populates $tradition from it: one witness per column, one reading per
# distinct non-empty cell value in each row, and a path through those
# readings for each witness.  Columns whose sigil ends in the collation's
# ac_label are folded into their main witness as its uncorrected (a.c.)
# layer.  Text mismatches found by the final consistency check are reported
# with warn().  No meaningful value is returned.
sub parse {
    my( $tradition, $opts ) = @_;
    my $alignment_table = _table_from_input( $opts );
    # Set up the witnesses we find in the first line
    my @witnesses;
    my %ac_wits;  # Track layered witness -> main witness mapping
    my $c = $tradition->collation; # shorthand
    my $aclabel = $c->ac_label;
    # NB: $sigil aliases the header cell, so renaming a layered sigil here
    # also renames it in the table itself.  The text-saving loop below
    # depends on seeing the renamed sigil.
    foreach my $sigil ( @{$alignment_table->[0]} ) {
        if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
            my $main_sigil = $1;
            # Sanitize the sigil name to an XML name
            $sigil = $main_sigil . '_layered';
            $ac_wits{$sigil} = $main_sigil;
        }
        my $wit = $tradition->add_witness(
            'sigil' => $sigil, 'sourcetype' => 'collation' );
        $wit->path( [ $c->start ] );
        push( @witnesses, $wit );
    }

    # Save the original witness text sequences. Have to loop back through
    # the witness columns after we have identified all the a.c. witnesses.
    foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
        my @sequence = map { $_->[$idx] } @{$alignment_table};
        my $sigil = shift @sequence;
        # Now get rid of gaps and meta-readings like #LACUNA#
        my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
        # A layered column stores its words as the layer text of its main
        # witness.  If the table has no column for that main witness, the
        # layered witness stays independent (the fold-in loop below skips
        # it), so its words become its own text.
        # NOTE(review): assumes witness() returns a false value for an
        # unknown sigil, as the fold-in loop already does - confirm.
        my $main_wit = exists( $ac_wits{$sigil} )
            ? $tradition->witness( $ac_wits{$sigil} )
            : undef;
        if( $main_wit ) {
            $main_wit->layertext( \@words );
        } else {
            $tradition->witness( $sigil )->text( \@words );
        }
    }

    # Row-by-row 'collated' relationships are skipped for very large tables.
    my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
    print STDERR "Tradition too big for row collation\n" if $nocollate;

    # Now for the next rows, make nodes as necessary, assign their ranks, and
    # add them to the witness paths.
    foreach my $idx ( 1 .. $#{$alignment_table} ) {
        my $row = $alignment_table->[$idx];
        my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
        foreach my $w ( 0 .. $#{$row} ) {
            # push the appropriate node onto the appropriate witness path
            my $word = $row->[$w];
            if( $word ) {
                my $reading = $nodes->{$word};
                my $wit = $witnesses[$w];
                push( @{$wit->path}, $reading );
            } # else skip it for empty readings.
        }
    }

    # Collapse our lacunae into a single node and
    # push the end node onto all paths.
    $c->end->rank( scalar @$alignment_table );
    foreach my $wit ( @witnesses ) {
        my $p = $wit->path;
        my $last_rdg = shift @$p;
        my $new_p = [ $last_rdg ];
        foreach my $rdg ( @$p ) {
            # Omit the reading if we are in a lacuna already.
            next if $rdg->is_lacuna && $last_rdg->is_lacuna;
            # Save the reading otherwise.
            push( @$new_p, $rdg );
            $last_rdg = $rdg;
        }
        push( @$new_p, $c->end );
        $wit->path( $new_p );
    }

    # Fold any a.c. witnesses into their main witness objects, and
    # delete the independent a.c. versions.
    foreach my $ac_sigil ( keys %ac_wits ) {
        my $ac_wit = $tradition->witness( $ac_sigil );
        my $main_wit = $tradition->witness( $ac_wits{$ac_sigil} );
        next unless $main_wit;
        $main_wit->is_layered(1);
        $main_wit->uncorrected_path( $ac_wit->path );
        $tradition->del_witness( $ac_wit );
    }

    # Join up the paths.
    $c->make_witness_paths;
    # Delete our unused lacuna nodes.
    foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
        $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
    }

    # Do a consistency check: the text read back from the graph must match
    # the word sequence saved from the table for every witness and layer.
    foreach my $wit ( $tradition->witnesses ) {
        my $pathtext = $c->path_text( $wit->sigil );
        my $origtext = join( ' ', @{$wit->text} );
        warn "Text differs for witness " . $wit->sigil
            unless $pathtext eq $origtext;
        if( $wit->is_layered ) {
            $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
            $origtext = join( ' ', @{$wit->layertext} );
            warn "Ante-corr text differs for witness " . $wit->sigil
                unless $pathtext eq $origtext;
        } else {
            warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
                if $wit->has_layertext;
        }
    }

    # Note that our ranks and common readings are set.
    $c->_graphcalc_done(1);
    _add_collations( $c ) unless $nocollate;
}

# _table_from_input( $opts )
#
# Turns the input named in $opts into an alignment table: a reference to an
# array of rows, each row a reference to an array of cell values, with the
# witness sigla in the first row.
#
# If $opts->{excel} is set ('xls' or 'xlsx'), the first worksheet of
# $opts->{file} is read.  Otherwise the input is parsed with Text::CSV using
# $opts->{sep_char} (tab by default), taken from $opts->{string} if that key
# exists, else from $opts->{file}.
#
# Every failure (missing option, missing parser module, unreadable or
# unparseable input) is reported through throw().
sub _table_from_input {
    my $opts = shift;
    my $alignment_table = [];
    if( $opts->{'excel'} ) {
        my $sheet;
        my $need_decode;
        unless( exists $opts->{'file'} ) {
            throw( "Must pass the filename for Excel parsing" );
        }
        if( $opts->{'excel'} eq 'xls' ) {
            try {
                require Spreadsheet::ParseExcel;
            } catch {
                throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
            }
            my $parser = Spreadsheet::ParseExcel->new();
            my $workbook = $parser->parse( $opts->{'file'} );
            unless( defined $workbook && defined $workbook->worksheet(0) ) {
                throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
            }
            $sheet = $workbook->worksheet(0);
        } elsif( $opts->{'excel'} eq 'xlsx' ) {
            try {
                require Spreadsheet::XLSX;
            } catch {
                throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
            }
            # Cell values from this parser are passed through decode_utf8
            # by _alignment_from_worksheet.
            $need_decode = 1;
            my $workbook;
            try {
                $workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
            } catch {
                throw( "Failed to parse file " . $opts->{'file'} );
            }
            # Fail with a parser error rather than a method call on undef
            # if no first worksheet could be obtained.
            $sheet = $workbook->worksheet(0) if defined $workbook;
            unless( defined $sheet ) {
                throw( "Failed to parse file " . $opts->{'file'} );
            }
        } else {
            throw( "Unrecognized Excel variant " . $opts->{'excel'} );
        }
        $alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
    } else {
        # Assume it is a comma-, tab-, or whatever-separated format.
        my $csv_options = { 'binary' => 1 };
        $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
        if( $csv_options->{'sep_char'} eq "\t" ) {
            # If it is really tab separated, nothing is an escape char.
            $csv_options->{'quote_char'} = undef;
            $csv_options->{'escape_char'} = undef;
        }
        my $csv = Text::CSV->new( $csv_options );

        if( exists $opts->{'string' } ) {
            my @lines = split( "\n", $opts->{'string'} );
            foreach my $l ( @lines ) {
                my $status = $csv->parse( $l );
                if( $status ) {
                    push( @$alignment_table, [ $csv->fields ] );
                } else {
                    throw( "Could not parse line $l: " . $csv->error_input );
                }
            }
        } elsif( exists $opts->{'file'} ) {
            my $file = $opts->{'file'};
            # Three-argument open, so the file name can never be taken as
            # an open mode.  An unreadable file is an error like any other
            # bad input, not a warning followed by an empty table.
            open( my $fh, '<', $file )
                or throw( "Could not open input file $file: $!" );
            binmode( $fh, ':utf8' );
            while( my $row = $csv->getline( $fh ) ) {
                push( @$alignment_table, $row );
            }
            # getline returns false both at end of file and on a parse
            # error; eof tells the two apart, so a malformed row cannot
            # silently truncate the table.
            my $reached_eof = $csv->eof;
            my $diagnostic = '' . $csv->error_diag;
            close $fh;
            unless( $reached_eof ) {
                throw( "Could not parse input file $file: $diagnostic" );
            }
        } else {
            throw( "Could not find string or file option to parse" );
        }
    }
    return $alignment_table;
}
# _alignment_from_worksheet( $sheet, $decode )
#
# Converts a worksheet object into an alignment table (array ref of row
# array refs).  $sheet must provide row_range(), col_range() and
# get_cell( $row, $col ), whose cells provide value().  If $decode is true,
# every cell value is passed through decode_utf8.
#
# The first row of the sheet's range holds the witness sigla; only columns
# with a true value in that row are kept.  Missing cells in later rows become
# undef so that each row stays aligned with the header.
#
# Throws if the sheet has no cells at all, or no sigla in its first row.
sub _alignment_from_worksheet {
    my( $sheet, $decode ) = @_;
    my $alignment_table = [];

    my( $rmin, $rmax ) = $sheet->row_range();
    my( $cmin, $cmax ) = $sheet->col_range();
    # The indices are zero-based, so a maximum of 0 is a legitimate one-row
    # or one-column sheet.  Only an inverted range means "nothing there".
    if( $rmax < $rmin || $cmax < $cmin ) {
        throw( "Found no rows or no columns in first worksheet" );
    }

    # Fetch one cell's (optionally decoded) value; undef if the cell is
    # missing.  Always call this in scalar context.
    my $cell_value = sub {
        my( $row, $col ) = @_;
        my $cell = $sheet->get_cell( $row, $col );
        return unless $cell;
        return $decode ? decode_utf8( $cell->value ) : $cell->value;
    };

    # Populate the alignment table. We only want columns that have
    # a sigil in row zero.
    my %sigcols = ();
    push( @$alignment_table, [] );
    foreach my $col ( $cmin .. $cmax ) {
        my $cellval = $cell_value->( $rmin, $col );
        if( $cellval ) {
            $sigcols{$col} = 1;
            push( @{$alignment_table->[0]}, $cellval );
        }
    }
    unless( keys %sigcols ) {
        throw( "Found no witness sigla in the first row of the worksheet" );
    }

    # Now go through the rest of the rows and pick up the columns
    # that were headed by a sigil.
    foreach my $row ( $rmin+1 .. $rmax ) {
        my @tablerow;
        foreach my $col ( $cmin .. $cmax ) {
            next unless $sigcols{$col};
            # Scalar assignment keeps an undef placeholder for empty cells.
            my $cellval = $cell_value->( $row, $col );
            push( @tablerow, $cellval );
        }
        push( @$alignment_table, \@tablerow );
    }
    return $alignment_table;
}

# _make_nodes( $collation, $row, $index, $nocollate )
#
# Creates one reading in $collation for each distinct true cell value in
# $row (an array ref of cell values for table row number $index) and returns
# a hash ref mapping each cell value to the reading that add_reading
# returned for it.
#
# Readings get the id "r$index.N", where N counts distinct values in order
# of first appearance in the row, so ids are reproducible between runs.
# The value '#LACUNA#' produces a reading flagged is_lacuna.  A reading is
# flagged is_common when it is the only distinct non-lacuna value in the row
# and no cell in the row is empty.
#
# $nocollate is accepted for compatibility with existing callers and is not
# used here.
sub _make_nodes {
    my( $collation, $row, $index, $nocollate ) = @_;
    my %unique;       # cell text -> placeholder, later the reading object
    my @order;        # distinct cell texts in order of first appearance
    my $has_gap = 0;  # 1 if at least one cell in the row is empty
    foreach my $w ( @$row ) {
        if( $w ) {
            push( @order, $w ) unless exists $unique{$w};
            $unique{$w} = 1;
        } else {
            $has_gap = 1;
        }
    }
    # Holds the number of unique readings + gaps, ex. lacunae.  All empty
    # cells together count as a single gap.
    my $commonctr = $has_gap + scalar( grep { $_ ne '#LACUNA#' } @order );

    my $ctr = 1;
    foreach my $w ( @order ) {
        my $rargs = {
            'id' => "r$index.$ctr",
            'rank' => $index,
            'text' => $w,
            };
        if( $w eq '#LACUNA#' ) {
            $rargs->{'is_lacuna'} = 1;
        } elsif( $commonctr == 1 ) {
            $rargs->{'is_common'} = 1;
        }
        my $r = $collation->add_reading( $rargs );
        $unique{$w} = $r;
        $ctr++;
    }
    return \%unique;
}

# _add_collations( $collation )
#
# Pins readings to their rank.  At every rank between start and end that
# holds more than one non-meta reading, the first reading with a predecessor
# at the previous rank becomes the anchor; every reading at that rank without
# such a predecessor gets a 'collated' relationship to the anchor, unless the
# collation already reports a relationship between the two.  A rank with no
# anchor is reported on STDERR.
sub _add_collations {
    my $collation = shift;

    # First pass: collect [ rank, anchor, reading ] triples for every rank.
    my @pending;
    RANK:
    for my $rank ( 1 .. $collation->end->rank - 1 ) {
        my @candidates = grep { !$_->is_meta } $collation->readings_at_rank( $rank );
        next RANK if @candidates <= 1;

        my $anchor;
        my @floating;  # readings not held in place by a predecessor
        READING:
        for my $candidate ( @candidates ) {
            for my $before ( $candidate->predecessors ) {
                next unless $before->rank == $rank - 1;
                # Directly follows the previous rank: it sits here anyway.
                $anchor ||= $candidate;
                next READING;
            }
            push( @floating, $candidate );
        }

        if( $anchor ) {
            for my $loose ( @floating ) {
                push( @pending, [ $rank, $anchor, $loose ] );
            }
        } else {
            print STDERR "No anchor found at $rank\n";
        }
    }

    # Second pass: create the relationships that do not exist yet.
    for my $triple ( @pending ) {
        my( $rank, @pair ) = @$triple;
        next if $collation->get_relationship( @pair );
        $collation->add_relationship( @pair,
            { 'type' => 'collated',
              'annotation' => "Collated together for rank $rank" } );
    }
}

# throw( $message )
#
# Raises a Text::Tradition::Error identified as a 'Parser::Tabular error'
# and carrying $message.
sub throw {
    my( $message ) = @_;
    my %error = (
        'ident' => 'Parser::Tabular error',
        'message' => $message,
    );
    Text::Tradition::Error->throw( %error );
}

1;

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews E<lt>aurum@cpan.orgE<gt>
Commit	Line	Data
d9e873d0	1	package Text::Tradition::Parser::Tabular;
	2
	3	use strict;
	4	use warnings;
3a3b8213	5	use Encode qw/ decode_utf8 /;
82fa4d57	6	use Text::CSV;
701ad2ba	7	use Text::Tradition::Error;
701ad2ba	8	use TryCatch;
d9e873d0	9
	10	=head1 NAME
	11
	12	Text::Tradition::Parser::Tabular
	13
3b853983	14	=head1 SYNOPSIS
	15
	16	use Text::Tradition;
	17
	18	my $t_from_file = Text::Tradition->new(
	19	'name' => 'my text',
	20	'input' => 'Tabular',
	21	'file' => '/path/to/collation.csv',
	22	'sep_char' => ','
	23	);
	24
	25	my $t_from_string = Text::Tradition->new(
	26	'name' => 'my text',
	27	'input' => 'Tabular',
	28	'string' => $tab_separated_collation,
	29	'sep_char' => "\t",
	30	);
	31
d9e873d0	32	=head1 DESCRIPTION
d9e873d0	33
a445ce40	34	Parser module for Text::Tradition to read an alignment table format, such as
a445ce40	35	CSV or Excel.
d9e873d0	36
	37	=head1 METHODS
	38
e867486f	39	=head2 B<parse>( $tradition, $option_hash )
3b853983	40
	41	Takes an initialized tradition and a set of options; creates the
	42	appropriate nodes and edges on the graph, as well as the appropriate
a445ce40	43	witness objects. The $option_hash can contain the following:
	44
	45	=over
	46
	47	=item * file - Name of file which contains the data
	48
	49	=item * string - A string that itself contains the data. One of 'file' or
	50	'string' is required.
	51
	52	=item * sep_char - For plaintext formats, the field separation character.
	53	Defaults to "\t" (tab-separated); should be "," for comma-separated format.
	54
	55	=item * excel - If the data is in an Excel file, this option should be set
	56	to 'xls' (for pre-2007 Excel format) or 'xlsx' (for Excel XML format.)
	57
	58	=back
	59
	60	The data should comprise a table with witnesses arranged in columns, with
	61	the witness sigla in the first row. Empty cells are interpreted as
	62	omissions (and thus stemmatologically relevant.) Longer lacunae in the
	63	text, to be disregarded in cladistic analysis, may be represented by
	64	filling the appropriate cells with the tag '#LACUNA#'.
3b853983	65
	66	If a witness name ends in the collation's ac_label, it will be treated as
	67	an 'ante-correction' version of the 'main' witness whose sigil it shares.
	68
	69	=begin testing
	70
fa6bc75d	71	use Test::More::UTF8;
3b853983	72	use Text::Tradition;
	73	binmode STDOUT, ":utf8";
	74	binmode STDERR, ":utf8";
	75	eval { no warnings; binmode $DB::OUT, ":utf8"; };
	76
	77	my $csv = 't/data/florilegium.csv';
	78	my $t = Text::Tradition->new(
	79	'name' => 'inline',
	80	'input' => 'Tabular',
	81	'file' => $csv,
	82	'sep_char' => ',',
	83	);
d9e873d0	84
3b853983	85	is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0	86
3b853983	87	### TODO Check these figures
3b853983	88	if( $t ) {
0e47f4f6	89	is( scalar $t->collation->readings, 311, "Collation has all readings" );
0e47f4f6	90	is( scalar $t->collation->paths, 361, "Collation has all paths" );
3b853983	91	is( scalar $t->witnesses, 13, "Collation has all witnesses" );
	92	}
	93
b0b4421a	94	# Check that we have the right witnesses
	95	my %seen_wits;
	96	map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
	97	foreach my $wit ( $t->witnesses ) {
	98	$seen_wits{$wit->sigil} = 1;
	99	}
	100	is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
	101	foreach my $k ( keys %seen_wits ) {
	102	ok( $seen_wits{$k}, "Witness $k still exists" );
	103	}
	104
	105	# Check that the witnesses have the right texts
	106	foreach my $wit ( $t->witnesses ) {
	107	my $origtext = join( ' ', @{$wit->text} );
	108	my $graphtext = $t->collation->path_text( $wit->sigil );
	109	is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
	110	}
	111
	112	# Check that the a.c. witnesses have the right text
	113	map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
	114	foreach my $k ( keys %seen_wits ) {
	115	my $wit = $t->witness( $k );
	116	if( $seen_wits{$k} ) {
	117	ok( $wit->is_layered, "Witness $k got marked as layered" );
	118	ok( $wit->has_layertext, "Witness $k has an a.c. version" );
	119	my $origtext = join( ' ', @{$wit->layertext} );
	120	my $acsig = $wit->sigil . $t->collation->ac_label;
861c3e27	121	my $graphtext = $t->collation->path_text( $acsig );
b0b4421a	122	is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
	123	} else {
	124	ok( !$wit->is_layered, "Witness $k not marked as layered" );
	125	ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
	126	}
	127	}
	128
cc31ebaa	129	# Check that we only have collation relationships where we need them
cc31ebaa	130	is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
cc31ebaa	131
701ad2ba	132	## Check excel parsing
	133
	134	my $xls = 't/data/armexample.xls';
	135	my $xt = Text::Tradition->new(
	136	'name' => 'excel test',
	137	'input' => 'Tabular',
	138	'file' => $xls,
3a3b8213	139	'excel' => 'xls'
701ad2ba	140	);
	141
	142	is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
	143	my %xls_wits;
	144	map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
	145	foreach my $wit ( $xt->witnesses ) {
	146	$xls_wits{$wit->sigil} = 1;
	147	}
	148	is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
	149	foreach my $k ( keys %xls_wits ) {
	150	ok( $xls_wits{$k}, "Witness $k still exists" );
	151	}
	152	is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
	153	is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
	154	is( $xt->collation->reading('r5.1')->text, "\x{587}",
	155	"Correct decoding of at least one reading" );
	156
3a3b8213	157	my $xlsx = 't/data/armexample.xlsx';
	158	my $xtx = Text::Tradition->new(
	159	'name' => 'excel test',
	160	'input' => 'Tabular',
	161	'file' => $xlsx,
	162	'excel' => 'xlsx'
	163	);
	164
	165	is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
	166	my %xlsx_wits;
fa6bc75d	167	map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit3 /;
fa6bc75d	168	$xlsx_wits{"\x{531}\x{562}2"} = 0;
3a3b8213	169	foreach my $wit ( $xtx->witnesses ) {
	170	$xlsx_wits{$wit->sigil} = 1;
	171	}
	172	is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
	173	foreach my $k ( keys %xlsx_wits ) {
	174	ok( $xlsx_wits{$k}, "Witness $k still exists" );
	175	}
	176	is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
	177	is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
	178	is( $xtx->collation->reading('r5.1')->text, "\x{587}",
	179	"Correct decoding of at least one reading" );
	180
3b853983	181	=end testing
d9e873d0	182
	183	=cut
	184
	185	sub parse {
dfc37e38	186	my( $tradition, $opts ) = @_;
3a3b8213	187	my $alignment_table = _table_from_input( $opts );
d9e873d0	188	# Set up the witnesses we find in the first line
d9e873d0	189	my @witnesses;
b0b4421a	190	my %ac_wits; # Track layered witness -> main witness mapping
3a3b8213	191	my $c = $tradition->collation; # shorthand
82fa4d57	192	my $aclabel = $c->ac_label;
d9e873d0	193	foreach my $sigil ( @{$alignment_table->[0]} ) {
3b853983	194	if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
82fa4d57	195	# Sanitize the sigil name to an XML name
82fa4d57	196	$sigil = $1 . '_layered';
b0b4421a	197	$ac_wits{$sigil} = $1;
3b853983	198	}
82fa4d57	199	my $wit = $tradition->add_witness(
	200	'sigil' => $sigil, 'sourcetype' => 'collation' );
	201	$wit->path( [ $c->start ] );
	202	push( @witnesses, $wit );
	203	my $aclabel = $c->ac_label;
d9e873d0	204	}
d9e873d0	205
b0b4421a	206	# Save the original witness text sequences. Have to loop back through
	207	# the witness columns after we have identified all the a.c. witnesses.
	208	foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
	209	my @sequence = map { $_->[$idx] } @{$alignment_table};
	210	my $sigil = shift @sequence;
	211	my $is_layer = exists( $ac_wits{$sigil} );
	212	my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
	213	# Now get rid of gaps and meta-readings like #LACUNA#
	214	my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
	215	$is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
	216	}
	217
9bdf9d67	218	my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
	219	print STDERR "Tradition too big for row collation\n" if $nocollate;
	220
d9e873d0	221	# Now for the next rows, make nodes as necessary, assign their ranks, and
d9e873d0	222	# add them to the witness paths.
d9e873d0	223	foreach my $idx ( 1 .. $#{$alignment_table} ) {
d9e873d0	224	my $row = $alignment_table->[$idx];
9bdf9d67	225	my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
d9e873d0	226	foreach my $w ( 0 .. $#{$row} ) {
	227	# push the appropriate node onto the appropriate witness path
	228	my $word = $row->[$w];
	229	if( $word ) {
	230	my $reading = $nodes->{$word};
	231	my $wit = $witnesses[$w];
	232	push( @{$wit->path}, $reading );
	233	} # else skip it for empty readings.
	234	}
	235	}
	236
eca16057	237	# Collapse our lacunae into a single node and
eca16057	238	# push the end node onto all paths.
d9e873d0	239	$c->end->rank( scalar @$alignment_table );
d9e873d0	240	foreach my $wit ( @witnesses ) {
eca16057	241	my $p = $wit->path;
	242	my $last_rdg = shift @$p;
	243	my $new_p = [ $last_rdg ];
	244	foreach my $rdg ( @$p ) {
83d5ac3a	245	# Omit the reading if we are in a lacuna already.
	246	next if $rdg->is_lacuna && $last_rdg->is_lacuna;
	247	# Save the reading otherwise.
	248	push( @$new_p, $rdg );
	249	$last_rdg = $rdg;
eca16057	250	}
	251	push( @$new_p, $c->end );
	252	$wit->path( $new_p );
d9e873d0	253	}
d9e873d0	254
3b853983	255	# Fold any a.c. witnesses into their main witness objects, and
	256	# delete the independent a.c. versions.
	257	foreach my $a ( keys %ac_wits ) {
b0b4421a	258	my $ac_wit = $tradition->witness( $a );
b0b4421a	259	my $main_wit = $tradition->witness( $ac_wits{$a} );
3b853983	260	next unless $main_wit;
861c3e27	261	$main_wit->is_layered(1);
3b853983	262	$main_wit->uncorrected_path( $ac_wit->path );
	263	$tradition->del_witness( $ac_wit );
	264	}
83d5ac3a	265
d9e873d0	266	# Join up the paths.
d9e873d0	267	$c->make_witness_paths;
83d5ac3a	268	# Delete our unused lacuna nodes.
	269	foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
	270	$c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
	271	}
861c3e27	272
	273	# Do a consistency check.
	274	foreach my $wit ( $tradition->witnesses ) {
	275	my $pathtext = $c->path_text( $wit->sigil );
	276	my $origtext = join( ' ', @{$wit->text} );
	277	warn "Text differs for witness " . $wit->sigil
	278	unless $pathtext eq $origtext;
	279	if( $wit->is_layered ) {
	280	$pathtext = $c->path_text( $wit->sigil.$c->ac_label );
	281	$origtext = join( ' ', @{$wit->layertext} );
	282	warn "Ante-corr text differs for witness " . $wit->sigil
	283	unless $pathtext eq $origtext;
	284	} else {
	285	warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
	286	if $wit->has_layertext;
	287	}
	288	}
202ccb18	289
	290	# Note that our ranks and common readings are set.
	291	$c->_graphcalc_done(1);
98a66507	292	_add_collations( $c ) unless $nocollate;
d9e873d0	293	}
d9e873d0	294
3a3b8213	295	sub _table_from_input {
	296	my $opts = shift;
	297	my $alignment_table = [];
	298	if( $opts->{'excel'} ) {
	299	my $sheet;
	300	my $need_decode;
	301	unless( exists $opts->{'file'} ) {
	302	throw( "Must pass the filename for Excel parsing" );
	303	}
	304	if( $opts->{'excel'} eq 'xls' ) {
	305	try {
	306	require Spreadsheet::ParseExcel;
	307	} catch {
	308	throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
	309	}
	310	my $parser = Spreadsheet::ParseExcel->new();
	311	my $workbook = $parser->parse( $opts->{'file'} );
	312	unless( defined $workbook && defined $workbook->worksheet(0) ) {
	313	throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
	314	}
	315	$sheet = $workbook->worksheet(0);
	316	} elsif( $opts->{'excel'} eq 'xlsx' ) {
	317	try {
	318	require Spreadsheet::XLSX;
	319	} catch {
	320	throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
	321	}
	322	$need_decode = 1;
	323	my $workbook;
	324	try {
	325	$workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
	326	} catch {
	327	throw( "Failed to parse file " . $opts->{'file'} );
	328	}
	329	$sheet = $workbook->worksheet(0);
	330	} else {
	331	throw( "Unrecognized Excel variant" . $opts->{'excel'} );
	332	}
	333	$alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
	334	} else {
	335	# Assume it is a comma-, tab-, or whatever-separated format.
	336	my $csv_options = { 'binary' => 1 };
	337	$csv_options->{'sep_char'} = $opts->{'sep_char'} \|\| "\t";
	338	if( $csv_options->{'sep_char'} eq "\t" ) {
	339	# If it is really tab separated, nothing is an escape char.
	340	$csv_options->{'quote_char'} = undef;
	341	$csv_options->{'escape_char'} = undef;
	342	}
	343	my $csv = Text::CSV->new( $csv_options );
	344
	345	if( exists $opts->{'string' } ) {
	346	my @lines = split( "\n", $opts->{'string'} );
	347	foreach my $l ( @lines ) {
	348	my $status = $csv->parse( $l );
	349	if( $status ) {
	350	push( @$alignment_table, [ $csv->fields ] );
	351	} else {
	352	throw( "Could not parse line $l: " . $csv->error_input );
	353	}
	354	}
	355	} elsif( exists $opts->{'file'} ) {
	356	open( my $fh, $opts->{'file'} )
	357	or warn "Could not open input file " . $opts->{'file'};
	358	binmode( $fh, ':utf8' );
359	while( my $row = $csv->getline( $fh ) ) {
360	push( @$alignment_table, $row );
361	}
362	close $fh;
363	} else {
364	throw( "Could not find string or file option to parse" );
365	}
366	}
367	return $alignment_table;
368	}
369	sub _alignment_from_worksheet {
370	my( $sheet, $decode ) = @_;
371	my $alignment_table = [];
372
373	my( $rmin, $rmax ) = $sheet->row_range();
374	my( $cmin, $cmax ) = $sheet->col_range();
375	unless( $cmax && $rmax ) {
376	throw( "Found no rows or no columns in first worksheet" );
377	}
378	# Populate the alignment table. We only want columns that have
379	# a sigil in row zero.
380	my %sigcols = ();
381	push( @$alignment_table, [] );
382	foreach my $col ( $cmin .. $cmax ) {
383	my $cell = $sheet->get_cell( $rmin, $col );
fa6bc75d	384	my $cellval;
	385	if( $cell ) {
	386	$cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
	387	}
3a3b8213	388	if( $cellval ) {
	389	$sigcols{$col} = 1;
	390	push( @{$alignment_table->[0]}, $cellval );
	391	}
	392	}
	393	# Now go through the rest of the rows and pick up the columns
	394	# that were headed by a sigil.
	395	foreach my $row ( $rmin+1 .. $rmax ) {
	396	my @tablerow;
	397	foreach my $col ( $cmin .. $cmax ) {
	398	next unless $sigcols{$col};
	399	my $cell = $sheet->get_cell( $row, $col );
	400	my $cellval;
	401	if( $cell ) {
	402	$cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
	403	}
	404	push( @tablerow, $cellval );
	405	}
	406	push( @$alignment_table, \@tablerow );
	407	}
	408	return $alignment_table;
	409	}
	410
027d819c	411	sub _make_nodes {
9bdf9d67	412	my( $collation, $row, $index, $nocollate ) = @_;
d9e873d0	413	my %unique;
15db7774	414	my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
d9e873d0	415	foreach my $w ( @$row ) {
d9e873d0	416	$unique{$w} = 1 if $w;
15db7774	417	$commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
d9e873d0	418	}
	419	my $ctr = 1;
	420	foreach my $w ( keys %unique ) {
a753cc84	421	my $rargs = {
10e4b1ac	422	'id' => "r$index.$ctr",
a753cc84	423	'rank' => $index,
	424	'text' => $w,
	425	};
15db7774	426	if( $w eq '#LACUNA#' ) {
	427	$rargs->{'is_lacuna'} = 1;
	428	} elsif( $commonctr == 1 ) {
	429	$rargs->{'is_common'} = 1;
	430	}
a753cc84	431	my $r = $collation->add_reading( $rargs );
d9e873d0	432	$unique{$w} = $r;
a753cc84	433	$ctr++;
d9e873d0	434	}
98a66507	435	return \%unique;
	436	}
	437
	438	sub _add_collations {
	439	my( $collation ) = shift;
	440	# For each reading that needs to be held in place, add a 'collated'
	441	# relationship to whatever anchor we can find. An anchor is a reading
	442	# that would occupy its rank by virtue of being subsequent to a
	443	# reading at $rank-1.
	444	my @collate_pairs;
	445	foreach my $r ( 1 .. $collation->end->rank - 1 ) {
98a66507	446	my $anchor;
	447	my @need_weak;
	448	my @here = grep { !$_->is_meta } $collation->readings_at_rank( $r );
	449	next unless @here > 1;
	450	foreach my $rdg ( @here ) {
	451	my $ip = 0;
	452	foreach my $pred ( $rdg->predecessors ) {
	453	if( $pred->rank == $r - 1 ) {
	454	$ip = 1;
	455	$anchor = $rdg unless( $anchor );
	456	last;
9bdf9d67	457	}
9bdf9d67	458	}
98a66507	459	push( @need_weak, $rdg ) unless $ip;
9bdf9d67	460	}
98a66507	461	$anchor
	462	? map { push( @collate_pairs, [ $r, $anchor, $_ ] ) } @need_weak
	463	: print STDERR "No anchor found at $r\n";
	464	}
	465	foreach my $p ( @collate_pairs ) {
	466	my $r = shift @$p;
	467	$collation->add_relationship( @$p,
	468	{ 'type' => 'collated',
	469	'annotation' => "Collated together for rank $r" } )
	470	unless $collation->get_relationship( @$p )
	471	}
d9e873d0	472	}
d9e873d0	473
3a3b8213	474	sub throw {
	475	Text::Tradition::Error->throw(
	476	'ident' => 'Parser::Tabular error',
	477	'message' => $_[0],
	478	);
	479	}
	480
3b853983	481	1;
	482
	483	=head1 LICENSE
	484
	485	This package is free software and is provided "as is" without express
	486	or implied warranty. You can redistribute it and/or modify it under
	487	the same terms as Perl itself.
	488
	489	=head1 AUTHOR
	490
	491	Tara L Andrews E<lt>aurum@cpan.orgE<gt>