Fix graphml output / input format
[scpubgit/stemmatology.git] / make_tradition.pl
CommitLineData
910a0a6d 1#!/usr/bin/env perl
2
3use lib 'lib';
4use strict;
5use warnings;
6use Getopt::Long;
7use Text::Tradition;
8use Text::Tradition::Stemma;
9
# Emit UTF-8 on the standard streams, and on the debugger's output handle
# when there is one (failures on that handle are deliberately ignored).
binmode STDERR, ":utf8";
binmode STDOUT, ":utf8";
eval { no warnings; binmode $DB::OUT, ":utf8"; };

# Option values with their defaults: linear graphs are on unless --nolinear
# is given, and the custom hack is off.
my( $informat, $inbase, $outformat, $help, $linear, $HACK )
    = ( '', '', '', '', 1, 0 );

# GetOptions returns false when the command line could not be parsed (for
# example an unknown option); show the usage and stop rather than carrying on
# with defaults the user did not ask for.
GetOptions( 'i|in=s'    => \$informat,
            'b|base=s'  => \$inbase,
            'o|out=s'   => \$outformat,
            'l|linear!' => \$linear,
            'h|help'    => \$help,
            'hack'      => \$HACK,
    ) or help( "Could not parse the command-line options" );

if( $help ) {
    help();
}
28
# Every accepted spelling of an input format (compared case-insensitively),
# mapped to the canonical name used from here on.  Both 'tab' and 'tabular'
# resolve to 'Tabular', and 'csv' resolves to 'CSV' so that later exact
# comparisons against the canonical names hold whatever case was typed.
my %canonical_informat = (
    'collatex' => 'CollateX',
    'csv'      => 'CSV',
    'cte'      => 'CTE',
    'kul'      => 'KUL',
    'self'     => 'Self',
    'tei'      => 'TEI',
    'tab'      => 'Tabular',
    'tabular'  => 'Tabular',
);
unless( exists $canonical_informat{ lc $informat } ) {
    help( "Input format must be one of CollateX, CSV, CTE, KUL, Self, TEI, Tabular" );
}
$informat = $canonical_informat{ lc $informat };

# The output format is matched case-sensitively because it later forms part
# of a method name.
unless( $outformat =~ /^(graphml|svg|dot|stemma|csv)$/ ) {
    help( "Output format must be one of graphml, svg, csv, stemma, or dot" );
}
42
# Do we have a base if we need it?
if( $informat eq 'KUL' && !$inbase ) {
    help( "$informat input needs a base text" );
}

# The input file is the first positional argument; without one there is
# nothing to parse, so show the usage instead of failing later.
my $input = $ARGV[0];
unless( defined $input && length $input ) {
    help( "No input file given" );
}

# KUL and CSV input is handed on as a filename; for every other format the
# file is read here and its contents are handed on as a single string.
unless( $informat eq 'KUL' || $informat eq 'CSV' ) {
    # Three-argument open with a lexical handle: the filename is never
    # interpreted as an open mode or a command.
    open( my $infile, '<:utf8', $input )
        or die "Could not read $input: $!";
    my @lines = <$infile>;
    close $infile;
    $input = join( '', @lines );
}
58
# Assemble the constructor arguments: the linearity flag, the input keyed by
# its format name, and the base text only when one was supplied.
my %args = ( 'linear' => $linear );
$args{$informat} = $input;
if( $inbase ) {
    $args{'base'} = $inbase;
}
my $tradition = Text::Tradition->new( %args );
65
### Custom hacking
# Remove witnesses C, E, G in the Matthew text
if( $HACK ) {
    # Delete every path whose label is exactly one of c, e or g (any case).
    foreach my $path ( $tradition->collation->paths() ) {
        next unless $path->label =~ /^[ceg]$/i;
        $tradition->collation->del_path( $path );
    }
    # Then delete each reading that reports neither outgoing nor incoming
    # connections, announcing it on STDERR first.
    foreach my $reading ( $tradition->collation->readings() ) {
        next if $reading->outgoing();
        next if $reading->incoming();
        print STDERR "Deleting reading " . $reading->label . "\n";
        $tradition->collation->del_reading( $reading );
    }
}
79
# Now output what we have been asked to.
if( $outformat ne 'stemma' ) {
    # Every other format is produced by the collation method named after it.
    my $method = "as_$outformat";
    print $tradition->collation->$method();
} else {
    # A stemma is computed from the collation; the call returns a success
    # flag and a second value that is printed as the result on success and
    # as the error detail otherwise.
    my $stemma = Text::Tradition::Stemma->new(
        'collation' => $tradition->collation );
    my( $ok, $payload ) = $stemma->run_phylip_pars();
    unless( $ok ) {
        print STDERR "Bad result: $payload";
    } else {
        print $payload;
    }
}
94
# Print the usage summary to STDERR, followed by $msg when one is given, and
# exit.  The exit status is 1 when a message was supplied (an error) and 0
# otherwise (help was requested).  This sub never returns.
sub help {
    my( $msg ) = @_;
    # The option names and format lists below mirror the GetOptions
    # specification and the validation done by the script.
    print STDERR <<"EOF";
Usage: $0 -i [format] -o [format] (--base [filename]) (--(no)linear) [inputfile]
    i, in: Format of the input file.  Must be one of CollateX, CSV, CTE, KUL, Self, TEI, Tabular.
    o, out: Format of the output.  Must be one of svg, dot, graphml, csv, stemma.
    b, base: Filename that contains a base text.  Needed for KUL input.
    l, linear: Treat transposed readings separately, producing a linear graph.
        If nolinear, treat transposed readings as the same node.
    h, help: Print this message.
EOF
    if( $msg ) {
        print STDERR "$msg\n";
    }
    exit( $msg ? 1 : 0 );
}