Merge branch 'master' of https://github.com/tla/stemmatology
[scpubgit/stemmatology.git] / base / script / make_tradition.pl
CommitLineData
910a0a6d 1#!/usr/bin/env perl
2
3use lib 'lib';
4use strict;
5use warnings;
6use Getopt::Long;
56c56f0d 7use TryCatch;
910a0a6d 8use Text::Tradition;
861c3e27 9use Text::Tradition::Directory;
d9befa73 10use Text::Tradition::StemmaUtil qw/ character_input phylip_pars newick_to_svg /;
910a0a6d 11
# Emit UTF-8 on the standard streams. The debugger's output handle is
# switched as well when it exists; any failure there is deliberately ignored.
binmode STDERR, ":utf8";
binmode STDOUT, ":utf8";
eval { no warnings; binmode $DB::OUT, ":utf8"; };

# Variables with defaults
my( $informat, $outformat, $language, $name, $sep, $dsn ) = ( '', '', 'Default',
    'Tradition', "\t", "dbi:SQLite:dbname=db/traditions.db" );
# Variables with no default
my( $inbase, $help, $stemmafile, $dbuser, $dbpass, $from, $to, $dbid,
    $nocalc, $nonlinear );

# GetOptions returns false (after warning on STDERR) when the command line
# contains unknown or malformed options; stop with the usage message rather
# than carrying on with a partially-parsed configuration.
GetOptions(
    'i|in=s'       => \$informat,
    'b|base=s'     => \$inbase,
    'o|out=s'      => \$outformat,
    'l|language=s' => \$language,
    'n|name=s'     => \$name,
    'h|help'       => \$help,
    's|stemma=s'   => \$stemmafile,
    'u|user=s'     => \$dbuser,
    'p|pass=s'     => \$dbpass,
    'f|from=s'     => \$from,
    't|to=s'       => \$to,
    'nl|nonlinear' => \$nonlinear,
    'sep=s'        => \$sep,
    'dsn=s'        => \$dsn,
    'dbid=s'       => \$dbid,
    'nc|nocalc'    => \$nocalc,
) or help( "Could not parse the command-line options" );

# An explicit request for help prints the usage and exits successfully.
if( $help ) {
    help();
}
44
# Map every accepted input-format spelling (lower-cased) to the canonical
# name used in the rest of the script. Looking the whole name up in a table
# avoids the alternation-precedence trap of an unanchored regex, and keeps
# validation and normalisation from drifting apart.
my %informat_for = (
    'collatex' => 'CollateX',
    'cx'       => 'CollateX',
    'kul'      => 'KUL',
    'cte'      => 'CTE',
    'self'     => 'Self',
    'tei'      => 'TEI',
    'tab'      => 'Tabular',
    'tabular'  => 'Tabular',
    'stone'    => 'CollateText',
    'xls'      => 'Tabular',
    'xlsx'     => 'Tabular',
    'db'       => 'db',
);
unless( exists $informat_for{ lc( $informat ) } ) {
    help( "Input format must be one of CollateX, CTE, KUL, Self, TEI, Tabular, XLS(X), or DB" );
}
# Remember which Excel flavour was requested before it is folded into Tabular.
my $excel = $informat =~ /^xlsx?$/i ? lc( $informat ) : undef;
$informat = $informat_for{ lc( $informat ) };

unless( $outformat =~ /^(?:graphml|svg|dot|adj(?:acency)?|stemma(?:svg)?|(?:c|t)sv|db)$/i ) {
    help( "Output format must be one of db, graphml, svg, csv, tsv, stemma, adjacency, or dot" );
}
# The format name is matched case-insensitively above but is compared
# case-sensitively, and used to build a method name, further down; fold it
# to lower case once here.
$outformat = lc( $outformat );
$outformat = 'adjacency_list' if $outformat =~ /^adj/;

if( $from || $to ) {
    help( "Subgraphs only supported in GraphML, dot, adjacency, or SVG format" )
        unless $outformat =~ /^(?:graphml|dot|svg|adjacency_list)$/;
}

# Do we have a base if we need it?
if( $informat =~ /^(?:KUL|CollateText)$/ && !$inbase ) {
    help( "$informat input needs a base text" );
}
# Allow the separator to be spelled out as the word 'tab'.
$sep = "\t" if $sep eq 'tab';
fa954f4c 73
# The single positional argument is either the input file or, for database
# input, the ID of the tradition to look up.
my $input = $ARGV[0];
help( "No input file or tradition ID was given" ) unless defined $input;
my $tradition;
my $dir;
if( lc( $informat ) eq 'db' ) {
    # Fetch an existing tradition from the directory at $dsn.
    my $dbargs = { dsn => $dsn };
    $dbargs->{'extra_args'}->{'user'} = $dbuser if $dbuser;
    $dbargs->{'extra_args'}->{'password'} = $dbpass if $dbpass;
    $dir = Text::Tradition::Directory->new( $dbargs );
    my $scope = $dir->new_scope();
    $tradition = $dir->lookup( $input );
    # Fail here with a clear message instead of dying later on a method
    # call against an undefined value.
    unless( $tradition ) {
        print STDERR "No tradition with ID $input found in $dsn\n";
        exit 1;
    }
} else {
    # Parse the input file into a new tradition, passing along only the
    # options that were actually set.
    my %args = ( 'input' => $informat,
                 'file'  => $input );
    $args{'linear'} = 0 if $nonlinear;
    $args{'base'} = $inbase if $inbase;
    $args{'language'} = $language if $language;
    $args{'name'} = $name if $name;
    $args{'nocalc'} = 1 if $nocalc;
    if( $informat eq 'Tabular' ) {
        # Excel input is identified by its flavour; delimited text needs
        # the separator character instead.
        if( $excel ) {
            $args{'excel'} = $excel;
        } else {
            $args{'sep_char'} = $sep;
        }
    }
    # If we are writing to the database, use that DB as the userstore.
    if( lc( $outformat ) eq 'db' ) {
        my $extra_args = { 'create' => 1 };
        $extra_args->{'user'} = $dbuser if $dbuser;
        $extra_args->{'password'} = $dbpass if $dbpass;
        $dir = Text::Tradition::Directory->new( 'dsn' => $dsn,
            'extra_args' => $extra_args );
        $args{'userstore'} = $dir;
    }
    ### Custom hacking for Stone
    if( $informat eq 'CollateText' ) {
        $args{'sigla'} = [ qw/ S M X V Z Bb B K W L / ];
    }
    # Declare the scope unconditionally: 'my ... if ...' has undefined
    # behaviour when the condition is false.
    my $scope = $dir ? $dir->new_scope() : undef;
    $tradition = Text::Tradition->new( %args );
}
# When a stemma file was named on the command line, hand it to the tradition
# as a dot file and report on STDERR if a stemma object came back.
if( $stemmafile ) {
    print STDERR "Saved stemma at $stemmafile\n"
        if $tradition->add_stemma( dotfile => $stemmafile );
}
910a0a6d 123
# Now output what we have been asked to.
if( $outformat =~ /^stemma(.*)$/ ) {
    # 'stemma' prints the Newick tree produced by the parsimony run;
    # 'stemmasvg' renders that tree as SVG.
    my $type = $1 || 'newick';
    my $cdata = character_input( $tradition );
    my $newick;
    try {
        $newick = phylip_pars( $cdata );
    } catch( Text::Tradition::Error $e ) {
        # Terminate the message with a newline and exit non-zero so that
        # callers can tell that no stemma was produced.
        print STDERR "Bad result from pars: " . $e->message . "\n";
        exit 1;
    }
    if( $type eq 'newick' ) {
        print $newick;
    } elsif( $type eq 'svg' ) {
        print newick_to_svg( $newick );
    }
} elsif( $outformat eq 'db' ) {
    # Store the tradition, creating the directory connection first if the
    # loading stage has not already made one.
    unless( $dir ) {
        my $extra_args = { 'create' => 1 };
        $extra_args->{'user'} = $dbuser if $dbuser;
        $extra_args->{'password'} = $dbpass if $dbpass;
        $dir = Text::Tradition::Directory->new( 'dsn' => $dsn,
            'extra_args' => $extra_args );
    }
    my $scope = $dir->new_scope;
    # Store under the requested ID if one was given; otherwise store()
    # is called without one and its return value is reported as the ID.
    my $uuid = $dbid ? $dir->store( $dbid => $tradition )
                     : $dir->store( $tradition );
    print STDERR "Saved tradition to database with ID $uuid\n";
} else {
    # Every remaining format is produced by the collation method named
    # as_<format>, which receives the optional subgraph bounds and the
    # nocalc flag.
    my $output = "as_$outformat";
    my $opts = {};
    $opts->{'from'} = $from if $from;
    $opts->{'to'} = $to if $to;
    $opts->{'nocalc'} = 1 if $nocalc;
    print $tradition->collation->$output( $opts );
}
164
# Print the usage summary to STDERR, followed by $msg if one is given, and
# exit. The exit status is 1 when a message (i.e. an error) was supplied and
# 0 when help was simply requested. This sub never returns.
sub help {
    my( $msg ) = @_;
    # The option names below mirror the GetOptions specification in this
    # script.
    print STDERR <<"EOF";
Usage: $0 -i [format] -o [format] [options] [inputfile or tradition ID]
    i, in:         Format of the input. Must be one of CollateX, CTE, KUL,
                   Self, TEI, Tabular, XLS, XLSX, or DB.
    o, out:        Format of the output. Must be one of db, graphml, svg,
                   csv, tsv, stemma, stemmasvg, adjacency, or dot.
    b, base:       Filename that contains a base text. Needed for KUL input.
    l, language:   Language of the tradition. Defaults to 'Default'.
    n, name:       Name of the tradition. Defaults to 'Tradition'.
    s, stemma:     Dot file containing a stemma to add to the tradition.
    f, from:       Start of the subgraph to output. Only supported for
                   GraphML, dot, adjacency, and SVG output.
    t, to:         End of the subgraph to output. Same restriction as --from.
    nl, nonlinear: Treat transposed readings as the same node, producing a
                   non-linear graph.
    nc, nocalc:    Pass the 'nocalc' flag to the tradition and to the output.
    sep:           Field separator for Tabular input; 'tab' means a tab
                   character, which is also the default.
    dsn:           DSN of the tradition database.
                   Defaults to dbi:SQLite:dbname=db/traditions.db
    u, user:       Database user name.
    p, pass:       Database password.
    dbid:          ID under which to store the tradition with '-o db'.
    h, help:       Print this message.
EOF
    if( $msg ) {
        print STDERR "$msg\n";
    }
    exit( $msg ? 1 : 0 );
}