From: Tara L Andrews <tla@mit.edu>
Date: Wed, 18 Jan 2012 20:29:11 +0000 (+0100)
Subject: remove punctuation handling logic; we will do this with relationships instead
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=30f0df340a2e9ca60316a1175f09f7388c113289;p=scpubgit%2Fstemmatology.git

remove punctuation handling logic; we will do this with relationships instead
---

diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm
index f713311..ee8e9da 100644
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -66,12 +66,6 @@ has 'linear' => (
     default => 1,
     );
     
-has 'collapse_punctuation' => (
-	is => 'rw',
-	isa => 'Bool',
-	default => 1,
-	);
-
 has 'ac_label' => (
     is => 'rw',
     isa => 'Str',
@@ -133,8 +127,6 @@ belongs. Required.
 transposed readings should be treated as two linked readings rather than one, 
 and therefore whether the collation graph is acyclic.  Defaults to true.
 
-=item * collapse_punctuation - TODO
-
 =item * baselabel - The default label for the path taken by a base text 
 (if any). Defaults to 'base text'.
 
@@ -154,8 +146,6 @@ the like.  Defaults to ' (a.c.)'.
 
 =head2 linear
 
-=head2 collapse_punctuation
-
 =head2 wit_list_separator
 
 =head2 baselabel
@@ -554,7 +544,7 @@ sub as_dot {
         $used{$reading->id} = 1;
         # Need not output nodes without separate labels
         next if $reading->id eq $reading->text;
-        my $label = $reading->punctuated_form;
+        my $label = $reading->text;
         $label =~ s/\"/\\\"/g;
         $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ];\n", $reading->id, $label );
     }
@@ -774,7 +764,6 @@ sub as_graphml {
         $node_el->setAttribute( 'id', $node_xmlid );
         foreach my $d ( keys %node_data ) {
         	my $nval = $n->$d;
-        	$nval = $n->punctuated_form if $d eq 'text';
         	_add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
         		if defined $nval;
         }
diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm
index 6664fee..5dfa552 100644
--- a/lib/Text/Tradition/Collation/Reading.pm
+++ b/lib/Text/Tradition/Collation/Reading.pm
@@ -82,22 +82,6 @@ has 'text' => (
 	writer => 'alter_text',
 	);
 	
-has 'punctuation' => (
-	traits => ['Array'],
-	isa => 'ArrayRef[HashRef[Str]]',
-	default => sub { [] },
-	handles => {
-			punctuation => 'elements',
-			add_punctuation => 'push',
-			},
-	);
-
-has 'separate_punctuation' => (
-	is => 'ro',
-	isa => 'Bool',
-	default => 1,
-	);
-
 has 'is_start' => (
 	is => 'ro',
 	isa => 'Bool',
@@ -143,18 +127,12 @@ around BUILDARGS => sub {
 	if( exists $args->{'json'} ) {
 		my $j = delete $args->{'json'};
 
-		# If we have separated punctuation and don't want it, restore it.
-		if( exists $j->{'punctuation'}
-			&& exists $args->{'separate_punctuation'}
-			&& !$args->{'separate_punctuation'} ) {
+		# If we have separated punctuation, restore it.
+		if( exists $j->{'punctuation'} ) {
 			$args->{'text'} = _restore_punct( $j->{'t'}, $j->{'punctuation'} );
-
-		# In all other cases, keep text and punct as they are.
 		} else {
 			$args->{'text'} = $j->{'t'};
-			# we don't use comparison or canonical forms here
-			$args->{'punctuation'} = $j->{'punctuation'}
-				if exists $j->{'punctuation'};
+			# we don't use comparison or canonical forms yet
 		}
 	}
 		
@@ -176,30 +154,7 @@ around BUILDARGS => sub {
 	$class->$orig( $args );
 };
 
-# Post-process the given text, stripping punctuation if we are asked.
-sub BUILD {
-	my $self = shift;
-	if( $self->separate_punctuation && !$self->is_meta
-		&& !$self->punctuation ) {
-		my $pos = 0;
-		my $wspunct = '';  # word sans punctuation
-		foreach my $char ( split( //, $self->text ) ) {
-			if( $char =~ /^[[:punct:]]$/ ) {
-				$self->add_punctuation( { 'char' => $char, 'pos' => $pos } );
-			} else {
-				$wspunct .= $char;
-			}
-			$pos++;
-		}
-		$self->alter_text( $wspunct );
-	}
-}
-
-sub punctuated_form {
-	my $self = shift;
-	return _restore_punct( $self->text, $self->punctuation );
-}
-
+# Utility function for parsing JSON from nCritic
 sub _restore_punct {
 	my( $word, @punct ) = @_;
 	foreach my $p ( sort { $a->{pos} <=> $b->{pos} } @punct ) {
diff --git a/lib/Text/Tradition/Parser/JSON.pm b/lib/Text/Tradition/Parser/JSON.pm
index 8d0aa8c..913a21b 100644
--- a/lib/Text/Tradition/Parser/JSON.pm
+++ b/lib/Text/Tradition/Parser/JSON.pm
@@ -87,7 +87,7 @@ sub parse {
 	my $table = from_json( $opts->{'string'} );
 	
 	# Create the witnesses
-    my @witnesses;
+	my @witnesses; # Keep the ordered list of our witnesses
     my %ac_wits;  # Track these for later removal
     foreach my $sigil ( map { $_->{'witness'} } @{$table->{'alignment'}} ) {
         my $wit = $tradition->add_witness( 'sigil' => $sigil );
@@ -163,19 +163,28 @@ $index.  Returns an array of readings of the same size as the original @tokenlis
 sub make_nodes {
 	my( $c, $idx, @tokens ) = @_;
 	my %unique;
-	my $ctr = 1;
-	foreach my $t ( @tokens ) {
-		next unless $t;
-		my $id = join( ',', $idx, $ctr++ );
-		my $rdg = Text::Tradition::Collation::Reading->new( 
-			'id' => $id, 'json' => $t, 'collation' => $c );
-		my $comptoken = $c->collapse_punctuation ? $rdg->text 
-			: $rdg->punctuated_form;
-		$unique{$comptoken} = $rdg;
-		$t->{'comptoken'} = $comptoken;
+	my @readings;
+	foreach my $j ( 0 .. $#tokens ) {
+		if( $tokens[$j] ) {
+			my $t = $tokens[$j];
+			my $rdg;
+			if( exists( $unique{$t->{'t'}} ) ) {
+				$rdg = $unique{$t->{'t'}};
+			} else {
+				my %args = ( 'id' => join( ',', $idx, $j+1 ),
+					'json' => $t,
+					'collation' => $c );
+				$args{'is_lacuna'} = 1 if $t->{'t'} eq '#LACUNA#';
+				$rdg = Text::Tradition::Collation::Reading->new( %args );
+				$unique{$t->{'t'}} = $rdg;
+			}
+			push( @readings, $rdg );
+		} else {
+			push( @readings, undef );
+		}
 	}
 	map { $c->add_reading( $_ ) } values( %unique );
-	return map { $_ && $unique{$_->{'comptoken'}} } @tokens;
+	return @readings;
 }
 
 1;