#!/usr/bin/env perl
#
# strip_punctuation.pl - post-processing script to get rid of punctuation.
#
# For every tradition in a Text::Tradition::Directory, strip leading and
# trailing non-word characters from the text of each reading, flatten the
# collation's ranks, and save the tradition back to the directory.
#
# Usage: strip_punctuation.pl DSN [USER PASSWORD]
#
# Originally committed by Tara L Andrews, Fri, 3 Feb 2012
# (stemmatology.git, commit 6f0ec5df2f4fb5ee38786f98a81439fcf8eab3ba).

use lib 'lib';
use strict;
use warnings;

# Run the driver only when executed as a script. When the file is loaded
# with require/do (e.g. from a test), only the subs are defined, so
# strip_punct can be exercised without a database.
exit main( @ARGV ) unless caller;

# main( $dsn, $user, $pass )
#
# Connects to the directory identified by $dsn (passing $user/$pass as
# extra connection arguments when both are supplied), rewrites the text of
# every reading of every tradition through strip_punct, and saves each
# tradition. Progress is reported on STDERR. Returns 0 on completion;
# dies with a usage message when no DSN is given.
sub main {
    my( $dsn, $user, $pass ) = @_;
    die "Usage: $0 DSN [USER PASSWORD]\n"
        unless defined $dsn && length $dsn;

    # Loaded at run time rather than with "use" so that merely compiling
    # or requiring this file does not need the Text::Tradition libraries.
    require Text::Tradition::Directory;

    # Reading texts are echoed to STDERR; give the handle an explicit
    # encoding so non-ASCII text is written consistently and without
    # "Wide character" warnings.
    binmode STDERR, ':encoding(UTF-8)';

    my $connect_args = { dsn => $dsn };
    # Test for non-empty strings rather than truthiness, so that a user or
    # password of "0" is not silently dropped.
    $connect_args->{'extra_args'} = { user => $user, password => $pass }
        if defined $user && length $user && defined $pass && length $pass;
    my $dir = Text::Tradition::Directory->new( $connect_args );

    foreach my $id ( $dir->tradition_ids ) {
        # Never read directly; held until the end of this iteration.
        # Presumably a scope guard that lookup/save rely on -- confirm
        # against Text::Tradition::Directory.
        my $scope = $dir->new_scope;
        my $tradition = $dir->lookup( $id );
        print STDERR "Processing tradition " . $tradition->name . "\n";
        # NOTE(review): every reading returned by the collation is
        # processed. If that list includes marker readings whose text is
        # delimited by punctuation, they will be altered too -- confirm
        # whether such readings should be skipped.
        foreach my $reading ( $tradition->collation->readings ) {
            $reading->alter_text( strip_punct( $reading->text ) );
        }
        $tradition->collation->flatten_ranks;
        $dir->save( $tradition );
    }

    print STDERR "Done\n";
    return 0;
}

# strip_punct( $rtext )
#
# Returns $rtext with any leading and trailing run of non-word characters
# removed. Text that contains no word character at all (e.g. a reading that
# consists only of punctuation) is returned unchanged, as is undef.
# Whenever the text is changed, the before/after pair is logged to STDERR.
sub strip_punct {
    my( $rtext ) = @_;
    return $rtext unless defined $rtext;
    my $orig_r = $rtext;

    # Force Unicode matching rules for \w and \W. Without this, a string
    # that is not internally UTF-8-flagged is matched with native rules,
    # under which Latin-1 letters such as e-acute count as non-word
    # characters and get stripped along with the punctuation.
    utf8::upgrade( $rtext );

    return $rtext unless $rtext =~ /\w/;
    $rtext =~ s/\A\W+//;
    $rtext =~ s/\W+\z//;
    print STDERR "Altering $orig_r to $rtext\n"
        unless $orig_r eq $rtext;
    return $rtext;
}

1;