[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / Tabular.pm

package Text::Tradition::Parser::Tabular;

use strict;
use warnings;
use Encode qw/ decode_utf8 /;
use Text::CSV;
use Text::Tradition::Error;
use TryCatch;

=head1 NAME

Text::Tradition::Parser::Tabular

=head1 SYNOPSIS

  use Text::Tradition;
  
  my $t_from_file = Text::Tradition->new( 
    'name' => 'my text',
    'input' => 'Tabular',
    'file' => '/path/to/collation.csv',
    'sep_char' => ','
    );
    
  my $t_from_string = Text::Tradition->new( 
    'name' => 'my text',
    'input' => 'Tabular',
    'string' => $tab_separated_collation,
    'sep_char' => "\t",
    );

=head1 DESCRIPTION

Parser module for Text::Tradition to read an alignment table format, such as CSV.

=head1 METHODS

=head2 B<parse>( $tradition, $option_hash )

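Takes an initialized tradition and a set of options; creates the
appropriate nodes and edges on the graph, as well as the appropriate
witness objects.  The $option_hash must contain either a 'file' or a
'string' argument with the table to be parsed; it may also contain a
'sep_char' argument to specify how the fields are separated (the default
is a tab character).  To read an Excel spreadsheet instead, pass the
'file' argument together with an 'excel' argument of either 'xls' or
'xlsx'; the first worksheet of the file is used.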

The table should have witnesses arranged in columns, with the witness sigla
in the first row.  Empty cells are interpreted as omissions (and are thus
stemmatologically relevant).  Longer lacunae in the text, to be disregarded
in cladistic analysis, may be represented by filling the appropriate cells
with the tag '#LACUNA#'.

If a witness name ends in the collation's ac_label, it will be treated as
an 'ante-correction' version of the 'main' witness whose sigil it shares.

=begin testing

use Text::Tradition;
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
eval { no warnings; binmode $DB::OUT, ":utf8"; };

my $csv = 't/data/florilegium.csv';
my $t = Text::Tradition->new( 
    'name'  => 'inline', 
    'input' => 'Tabular',
    'file'  => $csv,
    'sep_char' => ',',
    );

is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );

### TODO Check these figures
if( $t ) {
    is( scalar $t->collation->readings, 311, "Collation has all readings" );
    is( scalar $t->collation->paths, 361, "Collation has all paths" );
    is( scalar $t->witnesses, 13, "Collation has all witnesses" );
}

# Check that we have the right witnesses
my %seen_wits;
map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
foreach my $wit ( $t->witnesses ) {
	$seen_wits{$wit->sigil} = 1;
}
is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
foreach my $k ( keys %seen_wits ) {
	ok( $seen_wits{$k}, "Witness $k still exists" );
}

# Check that the witnesses have the right texts
foreach my $wit ( $t->witnesses ) {
	my $origtext = join( ' ', @{$wit->text} );
	my $graphtext = $t->collation->path_text( $wit->sigil );
	is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
}

# Check that the a.c. witnesses have the right text
map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
foreach my $k ( keys %seen_wits ) {
	my $wit = $t->witness( $k );
	if( $seen_wits{$k} ) {
		ok( $wit->is_layered, "Witness $k got marked as layered" );
		ok( $wit->has_layertext, "Witness $k has an a.c. version" );
		my $origtext = join( ' ', @{$wit->layertext} );
		my $acsig = $wit->sigil . $t->collation->ac_label;
		my $graphtext = $t->collation->path_text( $acsig );
		is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
	} else {
		ok( !$wit->is_layered, "Witness $k not marked as layered" );
		ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
	}
}	

# Check that we only have collation relationships where we need them
is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );

## Check excel parsing

my $xls = 't/data/armexample.xls';
my $xt = Text::Tradition->new(
	'name'  => 'excel test',
	'input' => 'Tabular',
	'file'  => $xls,
	'excel'   => 'xls'
	);

is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
my %xls_wits;
map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
foreach my $wit ( $xt->witnesses ) {
	$xls_wits{$wit->sigil} = 1;
}
is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
foreach my $k ( keys %xls_wits ) {
	ok( $xls_wits{$k}, "Witness $k still exists" );
}
is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
is( $xt->collation->reading('r5.1')->text, "\x{587}", 
	"Correct decoding of at least one reading" );

my $xlsx = 't/data/armexample.xlsx';
my $xtx = Text::Tradition->new(
	'name'  => 'excel test',
	'input' => 'Tabular',
	'file'  => $xlsx,
	'excel'   => 'xlsx'
	);

is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
my %xlsx_wits;
map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
foreach my $wit ( $xtx->witnesses ) {
	$xlsx_wits{$wit->sigil} = 1;
}
is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
foreach my $k ( keys %xlsx_wits ) {
	ok( $xlsx_wits{$k}, "Witness $k still exists" );
}
is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
is( $xtx->collation->reading('r5.1')->text, "\x{587}", 
	"Correct decoding of at least one reading" );

=end testing

=cut

# Build the collation graph of $tradition from an alignment table.
#
# Parameters:
#   $tradition - the tradition to populate; witnesses are added to it and
#                readings, paths and relationships to its collation.
#   $opts      - hashref of input options, handed unchanged to
#                _table_from_input, which returns the table as an arrayref
#                of rows (row 0 holds the witness sigla).
#
# Errors thrown while reading the table propagate to the caller.  Mismatches
# found by the final consistency check are only reported with warn().
sub parse {
    my( $tradition, $opts ) = @_;
    my $alignment_table = _table_from_input( $opts );
    # Set up the witnesses we find in the first line
    my @witnesses;
    my %ac_wits;  # Track layered witness -> main witness mapping
    my $c = $tradition->collation; # shorthand
    my $aclabel = $c->ac_label;
    foreach my $sigil ( @{$alignment_table->[0]} ) {
        if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
            # Grab the capture right away, before anything can clobber $1.
            my $main_sigil = $1;
            # Sanitize the sigil name to an XML name.  $sigil aliases the
            # header cell, so the table itself now carries the new name;
            # the text-saving loop below relies on that.
            $sigil = $main_sigil . '_layered';
            $ac_wits{$sigil} = $main_sigil;
        }
        my $wit = $tradition->add_witness(
            'sigil' => $sigil, 'sourcetype' => 'collation' );
        $wit->path( [ $c->start ] );
        push( @witnesses, $wit );
    }

    # Save the original witness text sequences. Have to loop back through
    # the witness columns after we have identified all the a.c. witnesses.
    foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
        my @sequence = map { $_->[$idx] } @{$alignment_table};
        my $sigil = shift @sequence;
        my $is_layer = exists( $ac_wits{$sigil} );
        my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
        if( $is_layer && !$wit ) {
            # A layered column without a main witness column.  The folding
            # step below leaves such a witness in place as an independent
            # witness, so store its words as its own text.
            # NOTE(review): assumes witness() returns a false value for an
            # unknown sigil, as the 'next unless $main_wit' check below
            # also does.
            $wit = $tradition->witness( $sigil );
            $is_layer = 0;
        }
        # Now get rid of gaps and meta-readings like #LACUNA#
        my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
        if( $is_layer ) {
            $wit->layertext( \@words );
        } else {
            $wit->text( \@words );
        }
    }

    # Row-by-row collation relationships are skipped above this table size.
    my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
    print STDERR "Tradition too big for row collation\n" if $nocollate;

    # Now for the next rows, make nodes as necessary, assign their ranks, and
    # add them to the witness paths.
    foreach my $idx ( 1 .. $#{$alignment_table} ) {
        my $row = $alignment_table->[$idx];
        my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
        foreach my $w ( 0 .. $#{$row} ) {
            # push the appropriate node onto the appropriate witness path
            my $word = $row->[$w];
            if( $word ) {
                my $reading = $nodes->{$word};
                my $wit = $witnesses[$w];
                push( @{$wit->path}, $reading );
            } # else skip it for empty readings.
        }
    }

    # Collapse our lacunae into a single node and
    # push the end node onto all paths.
    $c->end->rank( scalar @$alignment_table );
    foreach my $wit ( @witnesses ) {
        my $p = $wit->path;
        my $last_rdg = shift @$p;
        my $new_p = [ $last_rdg ];
        foreach my $rdg ( @$p ) {
            # Omit the reading if we are in a lacuna already.
            next if $rdg->is_lacuna && $last_rdg->is_lacuna;
            # Save the reading otherwise.
            push( @$new_p, $rdg );
            $last_rdg = $rdg;
        }
        push( @$new_p, $c->end );
        $wit->path( $new_p );
    }

    # Fold any a.c. witnesses into their main witness objects, and
    # delete the independent a.c. versions.
    foreach my $ac_sigil ( keys %ac_wits ) {
        my $ac_wit = $tradition->witness( $ac_sigil );
        my $main_wit = $tradition->witness( $ac_wits{$ac_sigil} );
        next unless $main_wit;
        $main_wit->is_layered(1);
        $main_wit->uncorrected_path( $ac_wit->path );
        $tradition->del_witness( $ac_wit );
    }

    # Join up the paths.
    $c->make_witness_paths;
    # Delete our unused lacuna nodes.
    foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
        $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
    }

    # Do a consistency check: the text read back from the graph must match
    # the word sequence saved from the table for every witness and layer.
    foreach my $wit ( $tradition->witnesses ) {
        my $pathtext = $c->path_text( $wit->sigil );
        my $origtext = join( ' ', @{$wit->text} );
        warn "Text differs for witness " . $wit->sigil
            unless $pathtext eq $origtext;
        if( $wit->is_layered ) {
            $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
            $origtext = join( ' ', @{$wit->layertext} );
            warn "Ante-corr text differs for witness " . $wit->sigil
                unless $pathtext eq $origtext;
        } else {
            warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
                if $wit->has_layertext;
        }
    }

    # Note that our ranks and common readings are set.
    $c->_graphcalc_done(1);
    # Remove redundant collation relationships.
    $c->relations->filter_collations() unless $nocollate;
}

# Read the alignment table described by the option hash.
#
# Parameters:
#   $opts - hashref.  With a true 'excel' value ('xls' or 'xlsx') the first
#           worksheet of $opts->{'file'} is read.  Otherwise the input is
#           parsed as delimited text, taken from $opts->{'string'} if that
#           key exists, else from the file named by $opts->{'file'};
#           $opts->{'sep_char'} is the field separator (default: tab).
#
# Returns an arrayref of rows, each row an arrayref of cell values.
#
# Throws (via throw) when a required option or module is missing, when the
# input cannot be opened, or when it cannot be parsed.
sub _table_from_input {
    my $opts = shift;
    my $alignment_table = [];
    if( $opts->{'excel'} ) {
        my $sheet;
        my $need_decode;
        unless( exists $opts->{'file'} ) {
            throw( "Must pass the filename for Excel parsing" );
        }
        if( $opts->{'excel'} eq 'xls' ) {
            try {
                require Spreadsheet::ParseExcel;
            } catch {
                throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
            }
            my $parser = Spreadsheet::ParseExcel->new();
            my $workbook = $parser->parse( $opts->{'file'} );
            unless( defined $workbook && defined $workbook->worksheet(0) ) {
                throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
            }
            $sheet = $workbook->worksheet(0);
        } elsif( $opts->{'excel'} eq 'xlsx' ) {
            try {
                require Spreadsheet::XLSX;
            } catch {
                throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
            }
            # Cell values from this parser are UTF-8 decoded by the
            # worksheet reader.
            $need_decode = 1;
            my $workbook;
            try {
                $workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
            } catch {
                throw( "Failed to parse file " . $opts->{'file'} );
            }
            $sheet = $workbook->worksheet(0);
            # Mirror the .xls branch: do not hand an undefined sheet on.
            unless( defined $sheet ) {
                throw( "Failed to parse file " . $opts->{'file'} );
            }
        } else {
            throw( "Unrecognized Excel variant " . $opts->{'excel'} );
        }
        $alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
    } else {
        # Assume it is a comma-, tab-, or whatever-separated format.
        my $csv_options = { 'binary' => 1 };
        $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
        if( $csv_options->{'sep_char'} eq "\t" ) {
            # If it is really tab separated, nothing is an escape char.
            $csv_options->{'quote_char'} = undef;
            $csv_options->{'escape_char'} = undef;
        }
        my $csv = Text::CSV->new( $csv_options )
            or throw( "Could not set up the CSV parser: " . Text::CSV->error_diag );

        if( exists $opts->{'string' } ) {
            my @lines = split( "\n", $opts->{'string'} );
            foreach my $l ( @lines ) {
                my $status = $csv->parse( $l );
                if( $status ) {
                    push( @$alignment_table, [ $csv->fields ] );
                } else {
                    throw( "Could not parse line $l: " . $csv->error_input );
                }
            }
        } elsif( exists $opts->{'file'} ) {
            # Three-argument open: the file name is taken literally, never
            # as a mode or a pipe.  A file we cannot open is a hard error.
            open( my $fh, '<', $opts->{'file'} )
                or throw( "Could not open input file " . $opts->{'file'} . ": $!" );
            binmode( $fh, ':utf8' );
            while( my $row = $csv->getline( $fh ) ) {
                push( @$alignment_table, $row );
            }
            # getline also returns false on a parse error; only a true
            # eof means that the whole file was read.
            my $complete = $csv->eof;
            my $diag = $complete ? '' : '' . $csv->error_diag;
            close $fh;
            unless( $complete ) {
                throw( "Could not parse file " . $opts->{'file'} . ": $diag" );
            }
        } else {
            throw( "Could not find string or file option to parse" );
        }
    }
    return $alignment_table;
}
# Turn a spreadsheet worksheet into an alignment table.
#
# Parameters:
#   $sheet  - worksheet object providing row_range(), col_range() and
#             get_cell( $row, $col ); cells provide value().
#   $decode - when true, every cell value is passed through decode_utf8.
#
# Returns an arrayref of rows.  Row 0 holds the sigla found in the first
# used row of the sheet; only columns with a non-empty sigil there are
# kept.  Cells that do not exist come back as undef.
#
# Throws (via throw) if the sheet has no rows or no columns.
sub _alignment_from_worksheet {
    my( $sheet, $decode ) = @_;
    my $alignment_table = [];

    my( $rmin, $rmax ) = $sheet->row_range();
    my( $cmin, $cmax ) = $sheet->col_range();
    # Ranges are zero-based, so a maximum of 0 is a perfectly good single
    # row or column; the sheet is empty when the maximum lies below the
    # minimum.
    my $has_rows = defined $rmin && defined $rmax && $rmax >= $rmin;
    my $has_cols = defined $cmin && defined $cmax && $cmax >= $cmin;
    unless( $has_rows && $has_cols ) {
        throw( "Found no rows or no columns in first worksheet" );
    }
    # Populate the alignment table. We only want columns that have
    # a sigil in row zero.
    my %sigcols = ();
    push( @$alignment_table, [] );
    foreach my $col ( $cmin .. $cmax ) {
        my $cell = $sheet->get_cell( $rmin, $col );
        my $cellval = $cell ? $cell->value() : undef;
        # Sigla get the same decoding as the text cells below.
        $cellval = decode_utf8( $cellval ) if $decode && defined $cellval;
        if( $cellval ) {
            $sigcols{$col} = 1;
            push( @{$alignment_table->[0]}, $cellval );
        }
    }
    # Now go through the rest of the rows and pick up the columns
    # that were headed by a sigil.
    foreach my $row ( $rmin+1 .. $rmax ) {
        my @tablerow;
        foreach my $col ( $cmin .. $cmax ) {
            next unless $sigcols{$col};
            my $cell = $sheet->get_cell( $row, $col );
            my $cellval;
            if( $cell ) {
                $cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
            }
            push( @tablerow, $cellval );
        }
        push( @$alignment_table, \@tablerow );
    }
    return $alignment_table;
}

# Create the readings for one row (rank) of the alignment table.
#
# Parameters:
#   $collation - collation object; must provide add_reading, and unless
#                $nocollate is true also add_relationship and
#                get_relationship.
#   $row       - arrayref of cell values, one per witness column.
#   $index     - row number; used as the rank and in the reading ids.
#   $nocollate - when true, no 'collated' relationships are created.
#
# Returns a hashref mapping each distinct non-empty cell text in the row
# to the reading object returned by add_reading.
#
# Readings are numbered "r$index.N" in order of first appearance in the
# row, so ids do not depend on hash ordering.  A '#LACUNA#' cell becomes a
# lacuna reading.  A reading is flagged is_common when it is the only
# variant in the row, i.e. every cell that is not a lacuna carries that
# same text; an empty cell counts as a variant of its own.
sub _make_nodes {
    my( $collation, $row, $index, $nocollate ) = @_;
    my %unique;    # cell text -> reading (filled in below)
    my @ordered;   # distinct non-empty cell texts, first appearance first
    my %variants;  # distinct variants outside lacunae; '' stands for a gap
    foreach my $w ( @$row ) {
        if( $w ) {
            push( @ordered, $w ) unless exists $unique{$w};
            $unique{$w} = 1;
        }
        $variants{ $w ? $w : '' } = 1 unless ( $w && $w eq '#LACUNA#' );
    }
    # Holds the number of unique readings + gaps, ex. lacunae.
    my $commonctr = scalar keys %variants;
    my $ctr = 1;
    foreach my $w ( @ordered ) {
        my $rargs = {
            'id' => "r$index.$ctr",
            'rank' => $index,
            'text' => $w,
            };
        if( $w eq '#LACUNA#' ) {
            $rargs->{'is_lacuna'} = 1;
        } elsif( $commonctr == 1 ) {
            $rargs->{'is_common'} = 1;
        }
        my $r = $collation->add_reading( $rargs );
        $unique{$w} = $r;
        $ctr++;
    }
    # Collate this sequence of readings via a single 'collation' relationship.
    unless( $nocollate ) {
        my @rankrdgs = map { $unique{$_} } @ordered;
        my $collation_rel;
        while( @rankrdgs ) {
            my $r = shift @rankrdgs;
            next if $r->is_meta;
            foreach my $nr ( @rankrdgs ) {
                next if $nr->is_meta;
                if( $collation_rel ) {
                    # Reuse the relationship made for the first pair.
                    $collation->add_relationship( $r, $nr, $collation_rel );
                } else {
                    $collation->add_relationship( $r, $nr,
                        { 'type' => 'collated',
                          'annotation' => "Parsed together for rank $index" } );
                    $collation_rel = $collation->get_relationship( $r, $nr );
                }
            }
        }
    }
    return \%unique;
}

# Raise a Text::Tradition::Error for this parser.  The first argument is
# used as the error message; the ident is always 'Parser::Tabular error'.
sub throw {
	my( $message ) = @_;
	my @error_args = (
		'ident'   => 'Parser::Tabular error',
		'message' => $message,
	);
	Text::Tradition::Error->throw( @error_args );
}

1;

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews E<lt>aurum@cpan.orgE<gt>
Commit	Line	Data
d9e873d0	1	package Text::Tradition::Parser::Tabular;
	2
	3	use strict;
	4	use warnings;
3a3b8213	5	use Encode qw/ decode_utf8 /;
82fa4d57	6	use Text::CSV;
701ad2ba	7	use Text::Tradition::Error;
701ad2ba	8	use TryCatch;
d9e873d0	9
	10	=head1 NAME
	11
	12	Text::Tradition::Parser::Tabular
	13
3b853983	14	=head1 SYNOPSIS
	15
	16	use Text::Tradition;
	17
	18	my $t_from_file = Text::Tradition->new(
	19	'name' => 'my text',
	20	'input' => 'Tabular',
	21	'file' => '/path/to/collation.csv',
	22	'sep_char' => ','
	23	);
	24
	25	my $t_from_string = Text::Tradition->new(
	26	'name' => 'my text',
	27	'input' => 'Tabular',
	28	'string' => $tab_separated_collation,
	29	'sep_char' => "\t",
	30	);
	31
d9e873d0	32	=head1 DESCRIPTION
	33
	34	Parser module for Text::Tradition to read an alignment table format, such as CSV.
	35
	36	=head1 METHODS
	37
e867486f	38	=head2 B<parse>( $tradition, $option_hash )
3b853983	39
	40	Takes an initialized tradition and a set of options; creates the
	41	appropriate nodes and edges on the graph, as well as the appropriate
	42	witness objects. The $option_hash must contain either a 'file' or a
	43	'string' argument with the table to be parsed; it may also contain a
	44	'sep_char' argument to specify how the fields are separated.
	45
	46	The table should have witnesses arranged in columns, with the witness sigla
	47	in the first row. Empty cells are interpreted as omissions (and thus
	48	stemmatologically relevant.) Longer lacunae in the text, to be disregarded
	49	in cladistic analysis, may be represented by filling the appropriate cells
	50	with the tag '#LACUNA#'.
	51
	52	If a witness name ends in the collation's ac_label, it will be treated as
	53	an 'ante-correction' version of the 'main' witness whose sigil it shares.
	54
	55	=begin testing
	56
	57	use Text::Tradition;
	58	binmode STDOUT, ":utf8";
	59	binmode STDERR, ":utf8";
	60	eval { no warnings; binmode $DB::OUT, ":utf8"; };
	61
	62	my $csv = 't/data/florilegium.csv';
	63	my $t = Text::Tradition->new(
	64	'name' => 'inline',
	65	'input' => 'Tabular',
	66	'file' => $csv,
	67	'sep_char' => ',',
	68	);
d9e873d0	69
3b853983	70	is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
d9e873d0	71
3b853983	72	### TODO Check these figures
3b853983	73	if( $t ) {
0e47f4f6	74	is( scalar $t->collation->readings, 311, "Collation has all readings" );
0e47f4f6	75	is( scalar $t->collation->paths, 361, "Collation has all paths" );
3b853983	76	is( scalar $t->witnesses, 13, "Collation has all witnesses" );
	77	}
	78
b0b4421a	79	# Check that we have the right witnesses
	80	my %seen_wits;
	81	map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
	82	foreach my $wit ( $t->witnesses ) {
	83	$seen_wits{$wit->sigil} = 1;
	84	}
	85	is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
	86	foreach my $k ( keys %seen_wits ) {
	87	ok( $seen_wits{$k}, "Witness $k still exists" );
	88	}
	89
	90	# Check that the witnesses have the right texts
	91	foreach my $wit ( $t->witnesses ) {
	92	my $origtext = join( ' ', @{$wit->text} );
	93	my $graphtext = $t->collation->path_text( $wit->sigil );
	94	is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
	95	}
	96
	97	# Check that the a.c. witnesses have the right text
	98	map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
	99	foreach my $k ( keys %seen_wits ) {
	100	my $wit = $t->witness( $k );
	101	if( $seen_wits{$k} ) {
	102	ok( $wit->is_layered, "Witness $k got marked as layered" );
	103	ok( $wit->has_layertext, "Witness $k has an a.c. version" );
	104	my $origtext = join( ' ', @{$wit->layertext} );
	105	my $acsig = $wit->sigil . $t->collation->ac_label;
861c3e27	106	my $graphtext = $t->collation->path_text( $acsig );
b0b4421a	107	is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
	108	} else {
	109	ok( !$wit->is_layered, "Witness $k not marked as layered" );
	110	ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
	111	}
	112	}
	113
cc31ebaa	114	# Check that we only have collation relationships where we need them
cc31ebaa	115	is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
cc31ebaa	116
701ad2ba	117	## Check excel parsing
	118
	119	my $xls = 't/data/armexample.xls';
	120	my $xt = Text::Tradition->new(
	121	'name' => 'excel test',
	122	'input' => 'Tabular',
	123	'file' => $xls,
3a3b8213	124	'excel' => 'xls'
701ad2ba	125	);
	126
	127	is( ref( $xt ), 'Text::Tradition', "Parsed test Excel 97-2004 file" );
	128	my %xls_wits;
	129	map { $xls_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
	130	foreach my $wit ( $xt->witnesses ) {
	131	$xls_wits{$wit->sigil} = 1;
	132	}
	133	is( scalar keys %xls_wits, 3, "No extra witnesses were made" );
	134	foreach my $k ( keys %xls_wits ) {
	135	ok( $xls_wits{$k}, "Witness $k still exists" );
	136	}
	137	is( scalar $xt->collation->readings, 11, "Got correct number of test readings" );
	138	is( scalar $xt->collation->paths, 13, "Got correct number of reading paths" );
	139	is( $xt->collation->reading('r5.1')->text, "\x{587}",
	140	"Correct decoding of at least one reading" );
	141
3a3b8213	142	my $xlsx = 't/data/armexample.xlsx';
	143	my $xtx = Text::Tradition->new(
	144	'name' => 'excel test',
	145	'input' => 'Tabular',
	146	'file' => $xlsx,
	147	'excel' => 'xlsx'
	148	);
	149
	150	is( ref( $xtx ), 'Text::Tradition', "Parsed test Excel 2007+ file" );
	151	my %xlsx_wits;
	152	map { $xlsx_wits{$_} = 0 } qw/ Wit1 Wit2 Wit3 /;
	153	foreach my $wit ( $xtx->witnesses ) {
	154	$xlsx_wits{$wit->sigil} = 1;
	155	}
	156	is( scalar keys %xlsx_wits, 3, "No extra witnesses were made" );
	157	foreach my $k ( keys %xlsx_wits ) {
	158	ok( $xlsx_wits{$k}, "Witness $k still exists" );
	159	}
	160	is( scalar $xtx->collation->readings, 12, "Got correct number of test readings" );
	161	is( scalar $xtx->collation->paths, 14, "Got correct number of reading paths" );
	162	is( $xtx->collation->reading('r5.1')->text, "\x{587}",
	163	"Correct decoding of at least one reading" );
	164
3b853983	165	=end testing
d9e873d0	166
	167	=cut
	168
	169	sub parse {
dfc37e38	170	my( $tradition, $opts ) = @_;
3a3b8213	171	my $alignment_table = _table_from_input( $opts );
d9e873d0	172	# Set up the witnesses we find in the first line
d9e873d0	173	my @witnesses;
b0b4421a	174	my %ac_wits; # Track layered witness -> main witness mapping
3a3b8213	175	my $c = $tradition->collation; # shorthand
82fa4d57	176	my $aclabel = $c->ac_label;
d9e873d0	177	foreach my $sigil ( @{$alignment_table->[0]} ) {
3b853983	178	if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
82fa4d57	179	# Sanitize the sigil name to an XML name
82fa4d57	180	$sigil = $1 . '_layered';
b0b4421a	181	$ac_wits{$sigil} = $1;
3b853983	182	}
82fa4d57	183	my $wit = $tradition->add_witness(
	184	'sigil' => $sigil, 'sourcetype' => 'collation' );
	185	$wit->path( [ $c->start ] );
	186	push( @witnesses, $wit );
	187	my $aclabel = $c->ac_label;
d9e873d0	188	}
d9e873d0	189
b0b4421a	190	# Save the original witness text sequences. Have to loop back through
	191	# the witness columns after we have identified all the a.c. witnesses.
	192	foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
	193	my @sequence = map { $_->[$idx] } @{$alignment_table};
	194	my $sigil = shift @sequence;
	195	my $is_layer = exists( $ac_wits{$sigil} );
	196	my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );
	197	# Now get rid of gaps and meta-readings like #LACUNA#
	198	my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
	199	$is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
	200	}
	201
9bdf9d67	202	my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
	203	print STDERR "Tradition too big for row collation\n" if $nocollate;
	204
d9e873d0	205	# Now for the next rows, make nodes as necessary, assign their ranks, and
d9e873d0	206	# add them to the witness paths.
d9e873d0	207	foreach my $idx ( 1 .. $#{$alignment_table} ) {
d9e873d0	208	my $row = $alignment_table->[$idx];
9bdf9d67	209	my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
d9e873d0	210	foreach my $w ( 0 .. $#{$row} ) {
	211	# push the appropriate node onto the appropriate witness path
	212	my $word = $row->[$w];
	213	if( $word ) {
	214	my $reading = $nodes->{$word};
	215	my $wit = $witnesses[$w];
	216	push( @{$wit->path}, $reading );
	217	} # else skip it for empty readings.
	218	}
	219	}
	220
eca16057	221	# Collapse our lacunae into a single node and
eca16057	222	# push the end node onto all paths.
d9e873d0	223	$c->end->rank( scalar @$alignment_table );
d9e873d0	224	foreach my $wit ( @witnesses ) {
eca16057	225	my $p = $wit->path;
	226	my $last_rdg = shift @$p;
	227	my $new_p = [ $last_rdg ];
	228	foreach my $rdg ( @$p ) {
83d5ac3a	229	# Omit the reading if we are in a lacuna already.
	230	next if $rdg->is_lacuna && $last_rdg->is_lacuna;
	231	# Save the reading otherwise.
	232	push( @$new_p, $rdg );
	233	$last_rdg = $rdg;
eca16057	234	}
	235	push( @$new_p, $c->end );
	236	$wit->path( $new_p );
d9e873d0	237	}
d9e873d0	238
3b853983	239	# Fold any a.c. witnesses into their main witness objects, and
	240	# delete the independent a.c. versions.
	241	foreach my $a ( keys %ac_wits ) {
b0b4421a	242	my $ac_wit = $tradition->witness( $a );
b0b4421a	243	my $main_wit = $tradition->witness( $ac_wits{$a} );
3b853983	244	next unless $main_wit;
861c3e27	245	$main_wit->is_layered(1);
3b853983	246	$main_wit->uncorrected_path( $ac_wit->path );
	247	$tradition->del_witness( $ac_wit );
	248	}
83d5ac3a	249
d9e873d0	250	# Join up the paths.
d9e873d0	251	$c->make_witness_paths;
83d5ac3a	252	# Delete our unused lacuna nodes.
	253	foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
	254	$c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
	255	}
861c3e27	256
	257	# Do a consistency check.
	258	foreach my $wit ( $tradition->witnesses ) {
	259	my $pathtext = $c->path_text( $wit->sigil );
	260	my $origtext = join( ' ', @{$wit->text} );
	261	warn "Text differs for witness " . $wit->sigil
	262	unless $pathtext eq $origtext;
	263	if( $wit->is_layered ) {
	264	$pathtext = $c->path_text( $wit->sigil.$c->ac_label );
	265	$origtext = join( ' ', @{$wit->layertext} );
	266	warn "Ante-corr text differs for witness " . $wit->sigil
	267	unless $pathtext eq $origtext;
	268	} else {
	269	warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
	270	if $wit->has_layertext;
	271	}
	272	}
202ccb18	273
	274	# Note that our ranks and common readings are set.
	275	$c->_graphcalc_done(1);
cc31ebaa	276	# Remove redundant collation relationships.
9bdf9d67	277	$c->relations->filter_collations() unless $nocollate;
d9e873d0	278	}
d9e873d0	279
3a3b8213	280	sub _table_from_input {
	281	my $opts = shift;
	282	my $alignment_table = [];
	283	if( $opts->{'excel'} ) {
	284	my $sheet;
	285	my $need_decode;
	286	unless( exists $opts->{'file'} ) {
	287	throw( "Must pass the filename for Excel parsing" );
	288	}
	289	if( $opts->{'excel'} eq 'xls' ) {
	290	try {
	291	require Spreadsheet::ParseExcel;
	292	} catch {
	293	throw( "Need module Spreadsheet::ParseExcel to parse .xls files" );
	294	}
	295	my $parser = Spreadsheet::ParseExcel->new();
	296	my $workbook = $parser->parse( $opts->{'file'} );
	297	unless( defined $workbook && defined $workbook->worksheet(0) ) {
	298	throw( "Failed to parse file " . $opts->{'file'} . ": " . $parser->error() );
	299	}
	300	$sheet = $workbook->worksheet(0);
	301	} elsif( $opts->{'excel'} eq 'xlsx' ) {
	302	try {
	303	require Spreadsheet::XLSX;
	304	} catch {
	305	throw( "Need module Spreadsheet::XLSX to parse .xlsx files" );
	306	}
	307	$need_decode = 1;
	308	my $workbook;
	309	try {
	310	$workbook = Spreadsheet::XLSX->new( $opts->{'file'} );
	311	} catch {
	312	throw( "Failed to parse file " . $opts->{'file'} );
	313	}
	314	$sheet = $workbook->worksheet(0);
	315	} else {
	316	throw( "Unrecognized Excel variant" . $opts->{'excel'} );
	317	}
	318	$alignment_table = _alignment_from_worksheet( $sheet, $need_decode );
	319	} else {
	320	# Assume it is a comma-, tab-, or whatever-separated format.
	321	my $csv_options = { 'binary' => 1 };
	322	$csv_options->{'sep_char'} = $opts->{'sep_char'} \|\| "\t";
	323	if( $csv_options->{'sep_char'} eq "\t" ) {
	324	# If it is really tab separated, nothing is an escape char.
	325	$csv_options->{'quote_char'} = undef;
	326	$csv_options->{'escape_char'} = undef;
	327	}
	328	my $csv = Text::CSV->new( $csv_options );
	329
	330	if( exists $opts->{'string' } ) {
	331	my @lines = split( "\n", $opts->{'string'} );
	332	foreach my $l ( @lines ) {
	333	my $status = $csv->parse( $l );
	334	if( $status ) {
	335	push( @$alignment_table, [ $csv->fields ] );
	336	} else {
	337	throw( "Could not parse line $l: " . $csv->error_input );
	338	}
	339	}
	340	} elsif( exists $opts->{'file'} ) {
	341	open( my $fh, $opts->{'file'} )
	342	or warn "Could not open input file " . $opts->{'file'};
	343	binmode( $fh, ':utf8' );
344	while( my $row = $csv->getline( $fh ) ) {
345	push( @$alignment_table, $row );
346	}
347	close $fh;
348	} else {
349	throw( "Could not find string or file option to parse" );
350	}
351	}
352	return $alignment_table;
353	}
354	sub _alignment_from_worksheet {
355	my( $sheet, $decode ) = @_;
356	my $alignment_table = [];
357
358	my( $rmin, $rmax ) = $sheet->row_range();
359	my( $cmin, $cmax ) = $sheet->col_range();
360	unless( $cmax && $rmax ) {
361	throw( "Found no rows or no columns in first worksheet" );
362	}
363	# Populate the alignment table. We only want columns that have
364	# a sigil in row zero.
365	my %sigcols = ();
366	push( @$alignment_table, [] );
367	foreach my $col ( $cmin .. $cmax ) {
368	my $cell = $sheet->get_cell( $rmin, $col );
369	my $cellval = $cell ? $cell->value() : undef;
370	if( $cellval ) {
371	$sigcols{$col} = 1;
372	push( @{$alignment_table->[0]}, $cellval );
373	}
374	}
375	# Now go through the rest of the rows and pick up the columns
376	# that were headed by a sigil.
377	foreach my $row ( $rmin+1 .. $rmax ) {
378	my @tablerow;
379	foreach my $col ( $cmin .. $cmax ) {
380	next unless $sigcols{$col};
381	my $cell = $sheet->get_cell( $row, $col );
382	my $cellval;
383	if( $cell ) {
384	$cellval = $decode ? decode_utf8( $cell->value ) : $cell->value;
385	}
386	push( @tablerow, $cellval );
387	}
388	push( @$alignment_table, \@tablerow );
389	}
390	return $alignment_table;
391	}
392
027d819c	393	sub _make_nodes {
9bdf9d67	394	my( $collation, $row, $index, $nocollate ) = @_;
d9e873d0	395	my %unique;
15db7774	396	my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
d9e873d0	397	foreach my $w ( @$row ) {
d9e873d0	398	$unique{$w} = 1 if $w;
15db7774	399	$commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
d9e873d0	400	}
	401	my $ctr = 1;
	402	foreach my $w ( keys %unique ) {
a753cc84	403	my $rargs = {
10e4b1ac	404	'id' => "r$index.$ctr",
a753cc84	405	'rank' => $index,
	406	'text' => $w,
	407	};
15db7774	408	if( $w eq '#LACUNA#' ) {
	409	$rargs->{'is_lacuna'} = 1;
	410	} elsif( $commonctr == 1 ) {
	411	$rargs->{'is_common'} = 1;
	412	}
a753cc84	413	my $r = $collation->add_reading( $rargs );
d9e873d0	414	$unique{$w} = $r;
a753cc84	415	$ctr++;
d9e873d0	416	}
bf6e338d	417	# Collate this sequence of readings via a single 'collation' relationship.
9bdf9d67	418	unless( $nocollate ) {
	419	my @rankrdgs = values %unique;
	420	my $collation_rel;
	421	while( @rankrdgs ) {
	422	my $r = shift @rankrdgs;
	423	next if $r->is_meta;
	424	foreach my $nr ( @rankrdgs ) {
	425	next if $nr->is_meta;
	426	if( $collation_rel ) {
	427	$collation->add_relationship( $r, $nr, $collation_rel );
	428	} else {
	429	$collation->add_relationship( $r, $nr,
	430	{ 'type' => 'collated',
	431	'annotation' => "Parsed together for rank $index" } );
	432	$collation_rel = $collation->get_relationship( $r, $nr );
	433	}
	434	}
	435	}
	436	}
d9e873d0	437	return \%unique;
	438	}
	439
3a3b8213	440	sub throw {
	441	Text::Tradition::Error->throw(
	442	'ident' => 'Parser::Tabular error',
	443	'message' => $_[0],
	444	);
	445	}
	446
3b853983	447	1;
	448
	449	=head1 LICENSE
	450
	451	This package is free software and is provided "as is" without express
	452	or implied warranty. You can redistribute it and/or modify it under
	453	the same terms as Perl itself.
	454
	455	=head1 AUTHOR
	456
	457	Tara L Andrews E<lt>aurum@cpan.orgE<gt>