From: Tara L Andrews <tla\@mit.edu>
Date: Fri, 6 Jan 2012 15:38:45 +0000 (+0100)
Subject: fix up CTE parser, including an ugly hack I need, with new graph
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=127201444b501c247b995702f092c943f303980c;p=scpubgit%2Fstemmatology.git

fix up CTE parser, including an ugly hack I need, with new graph
---

diff --git a/TreeOfTexts/lib/TreeOfTexts/Controller/Root.pm b/TreeOfTexts/lib/TreeOfTexts/Controller/Root.pm
index 9707dfa..3141c0b 100644
--- a/TreeOfTexts/lib/TreeOfTexts/Controller/Root.pm
+++ b/TreeOfTexts/lib/TreeOfTexts/Controller/Root.pm
@@ -24,7 +24,8 @@ TreeOfTexts::Controller::Root - Root Controller for TreeOfTexts
 
 =head2 index
 
-The root page (/)
+The root page (/).  Lists the traditions available in the DB to work on,
+and should also eventually have an 'Upload new' interface.
 
 =cut
 
@@ -33,11 +34,10 @@ sub index :Path :Args(0) {
 
     my $m = $c->model('Directory');
     my @all_texts;
-    foreach my $id ( $m->traditions ) {
+    foreach my $id ( $m->tradition_ids ) {
     	my $data = { 
     		'id' => $id,
-    		'name' => $m->tradition( $id )->name,
-    		'has_stemma' => defined $m->stemma( $id ),
+    		'name' => $m->name( $id ),
     	};
     	push( @all_texts, $data );
     }
@@ -46,6 +46,17 @@ sub index :Path :Args(0) {
     $c->stash->{template} = 'frontpage.tt';
 }
 
+=head2 tradition (TODO)
+
+The main page for a tradition, with information about it and links to the
+available tools.
+
+=head2 relationships
+
+The relationship editor tool.
+
+=cut
+
 sub relationships :Local {
 	my( $self, $c ) = @_;
 	my $m = $c->model('Directory');
@@ -54,22 +65,37 @@ sub relationships :Local {
 	$c->stash->{template} = 'relationships.tt';	
 }
 
+=head2 stexaminer
+
+The stemma analysis tool with the pretty colored table.
+
+=cut
+
 sub stexaminer :Local {
     my( $self, $c ) = @_;
     my $m = $c->model('Directory');
-	my $id = $c->request->params->{'textid'};
-	my $tradition = $m->tradition( $id );
-	my $stemma = $m->stemma( $id );
-	my $t = run_analysis( $tradition, $stemma );
+	my $tradition = $m->tradition( $c->request->params->{'textid'} );
+	my $stemma = $tradition->stemma;
+	# TODO Think about caching the stemma in a session 
 	$c->stash->{svg} = $stemma->as_svg;
-	$c->stash->{variants} = $t->{'variants'};
 	$c->stash->{text_title} = $tradition->name;
+	$c->stash->{template} = 'index.tt'; 
+	# TODO Run the analysis as AJAX from the loaded page.
+	my $t = run_analysis( $tradition );
+	$c->stash->{variants} = $t->{'variants'};
 	$c->stash->{total} = $t->{'variant_count'};
 	$c->stash->{genealogical} = $t->{'genealogical_count'};
 	$c->stash->{conflict} = $t->{'conflict_count'};
-	$c->stash->{template} = 'index.tt'; 
 }
 
+=head1 OPENSOCIAL URLs
+
+=head2 view_table
+
+Simple gadget to return the analysis table for the stexaminer
+
+=cut
+
 sub view_table :Local {
     my( $self, $c ) = @_;
     my $m = $c->model('Directory');
@@ -79,10 +105,16 @@ sub view_table :Local {
     $c->stash->{template} = 'table_gadget.tt';
 }
 
+=head2 view_svg
+
+Simple gadget to return the SVG for a given stemma
+
+=cut
+
 sub view_svg :Local {
     my( $self, $c ) = @_;
     my $m = $c->model('Directory');
-	my $stemma = $m->stemma( $c->request->params->{'textid'} );
+    my $stemma = $m->tradition( $c->request->params->{'textid'} )->stemma;
 	if( $stemma ) {
 	   	$c->stash->{svg} = $stemma->as_svg;
 	}
diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm
index d0d0385..b8265e0 100644
--- a/lib/Text/Tradition/Collation/Reading.pm
+++ b/lib/Text/Tradition/Collation/Reading.pm
@@ -37,6 +37,8 @@ Options include:
 
 =item is_lacuna - The 'reading' represents a known gap in the text.
 
+=item is_ph - A temporary placeholder for apparatus parsing purposes.  Do not use unless you know what you are doing.
+
 =item rank - The sequence number of the reading. This should probably not be set manually.
 
 =back
@@ -98,6 +100,12 @@ has 'is_lacuna' => (
     isa => 'Bool',
 	default => undef,
     );
+    
+has 'is_ph' => (
+	is => 'ro',
+	isa => 'Bool',
+	default => undef,
+	);
 
 has 'rank' => (
     is => 'rw',
@@ -128,6 +136,8 @@ around BUILDARGS => sub {
 	} elsif( exists $args->{'is_end'} ) {
 		$args->{'id'} = '#END#';	# Change the ID to ensure we have only one
 		$args->{'text'} = '#END#';
+	} elsif( exists $args->{'is_ph'} ) {
+		$args->{'text'} = $args->{'id'};
 	}
 	
 	$class->$orig( $args );
@@ -143,7 +153,7 @@ of text found in a witness.
 
 sub is_meta {
 	my $self = shift;
-	return $self->is_start || $self->is_end || $self->is_lacuna;	
+	return $self->is_start || $self->is_end || $self->is_lacuna || $self->is_ph;	
 }
 
 # Some syntactic sugar
diff --git a/lib/Text/Tradition/Parser/CTE.pm b/lib/Text/Tradition/Parser/CTE.pm
index b21da1b..c4877cc 100644
--- a/lib/Text/Tradition/Parser/CTE.pm
+++ b/lib/Text/Tradition/Parser/CTE.pm
@@ -32,6 +32,9 @@ initializes the Tradition from the file.
 my %sigil_for;  # Save the XML IDs for witnesses.
 my %apps;       # Save the apparatus XML for a given ID.    
 my %has_ac;     # Keep track of witnesses that have corrections.
+my %group_sigla = (   ## HACK HACK HACK
+	'#M38' => [ qw( #M23 #M24 #M25 #M27 #M30 #M26 #M31 #M32 #M33 ) ], # l -> L*
+	);
 
 sub parse {
 	my( $tradition, $opts ) = @_;
@@ -60,8 +63,10 @@ sub parse {
 		my $id= $wit_el->getAttribute( 'xml:id' );
 		my @sig_parts = $xpc->findnodes( './abbr/descendant::text()', $wit_el );
 		my $sig = _stringify_sigil( @sig_parts );
-		$tradition->add_witness( sigil => $sig, source => $wit_el->toString() );
-		$sigil_for{'#'.$id} = $sig;  # Make life easy by keying on the ID ref syntax
+		unless( exists $group_sigla{'#'.$id} ) {  ## More HACKY
+			$tradition->add_witness( sigil => $sig, source => $wit_el->toString() );
+			$sigil_for{'#'.$id} = $sig;  # Make life easy by keying on the ID ref syntax
+		}
 	}
 
 	# Now go through the text and find the base tokens, apparatus tags, and
@@ -85,15 +90,14 @@ sub parse {
 	foreach my $item ( @base_text ) {
 	    my $r;
         if( $item->{'type'} eq 'token' ) {
-            $r = $c->add_reading( 'n'.$counter++ );
-            $r->text( $item->{'content'} );
+            $r = $c->add_reading( { id => 'n'.$counter++, 
+            						text => $item->{'content'} } );
         } elsif ( $item->{'type'} eq 'anchor' ) {
-            $r = $c->add_reading( '#ANCHOR_' . $item->{'content'} . '#' );
-            $r->is_meta(1);
+            $r = $c->add_reading( { id => '#ANCHOR_' . $item->{'content'} . '#', 
+            						is_ph => 1 } );
         } elsif ( $item->{'type'} eq 'app' ) {
             my $tag = '#APP_' . $counter++ . '#';
-            $r = $c->add_reading( $tag );
-            $r->is_meta(1);
+            $r = $c->add_reading( { id => $tag, is_ph => 1 } );
             $apps{$tag} = $item->{'content'};
         }
         $c->add_path( $last, $r, $c->baselabel );
@@ -169,29 +173,33 @@ sub _add_readings {
     my $ctr = 0;
     my $tag = $app_id;
     $tag =~ s/^\#APP_(.*)\#$/$1/;
-    $DB::single = 1 if $tag < 2;
     foreach my $rdg ( $xn->getChildrenByTagName( 'rdg' ) ) {
         my @text;
         foreach ( $rdg->childNodes ) {
             push( @text, _get_base( $_ ) );
         }
-        my $interpreted = @text 
-            ? interpret( join( ' ', map { $_->{'content'} } @text ), $lemma_str ) 
-            : '';
+        my( $interpreted, $flag ) = ( '', undef );
+        if( @text ) {
+        	( $interpreted, $flag ) = interpret( 
+        		join( ' ', map { $_->{'content'} } @text ), $lemma_str );
+        }
         my @rdg_nodes;
         foreach my $w ( split( /\s+/, $interpreted ) ) {
-            my $r = $c->add_reading( $tag . "/" . $ctr++ );
-            $r->text( $w );
+            my $r = $c->add_reading( { id => $tag . "/" . $ctr++,
+            						   text => $w } );
             push( @rdg_nodes, $r );
         }
         
         # For each listed wit, save the reading.
         foreach my $wit ( split( /\s+/, $rdg->getAttribute( 'wit' ) ) ) {
+			$wit .= $flag if $flag;
             $wit_rdgs{$wit} = \@rdg_nodes;
         }
+        		
         # Does the reading have an ID? If so it probably has a witDetail
         # attached, and we need to read it.
         if( $rdg->hasAttribute( 'xml:id' ) ) {
+        	warn "Witdetail on meta reading" if $flag; # this could get complicated.
             my $rid = $rdg->getAttribute( 'xml:id' );
             my $xpc = XML::LibXML::XPathContext->new( $xn );
             my @details = $xpc->findnodes( './witDetail[@target="'.$rid.'"]' );
@@ -202,7 +210,24 @@ sub _add_readings {
     }       
         
     # Now collate the variant readings, since it is not done for us.
-    collate_variants( $c, \@lemma, values %wit_rdgs );    
+    collate_variants( $c, \@lemma, values %wit_rdgs );
+    
+    # HACKY HACKY Expand "group" sigla.
+    # Does not work for nested groups; also does not work with a modifier
+    # on the group sigil.
+    foreach my $wit_id ( keys %group_sigla ) {
+    	if ( exists $wit_rdgs{$wit_id} ) {
+    		my $rdg = $wit_rdgs{$wit_id};
+    		foreach my $w ( @{$group_sigla{$wit_id}} ) {
+    			if( exists $wit_rdgs{$w} ) {
+    				$DB::single = 1;
+					warn "Had reading for individual member $w of group $wit_id at $xn";
+    			}
+    			$wit_rdgs{$w} = $rdg;
+    		}
+    		delete $wit_rdgs{$wit_id};
+    	}
+    }
     
     # Now add the witness paths for each reading.
     foreach my $wit_id ( keys %wit_rdgs ) {
@@ -220,10 +245,9 @@ sub _anchor_name {
 
 sub _return_lemma {
     my( $c, $app, $anchor ) = @_;
-    my $app_node = $c->graph->node( $app );
-    my $anchor_node = $c->graph->node( $anchor );
-    my @nodes = grep { $_->name !~ /^\#A(PP|NCHOR)/ } 
-        $c->reading_sequence( $app_node, $anchor_node, $c->baselabel );
+    my @nodes = grep { $_->id !~ /^\#A(PP|NCHOR)/ } 
+        $c->reading_sequence( $c->reading( $app ), $c->reading( $anchor ),
+        	$c->baselabel );
     return @nodes;
 }
 
@@ -234,26 +258,34 @@ sub interpret {
 	my $oldreading = $reading;
 	# $lemma =~ s/\s+[[:punct:]]+$//;
 	# $reading =~ s/\s*\(?sic([\s\w.]+)?\)?$//;
+	my $flag;  # In case of p.c. indications
 	my @words = split( /\s+/, $lemma );
 	if( $reading =~ /^(.*) praem.$/ ) {
 		$reading = "$1 $lemma";
 	} elsif( $reading =~ /^(.*) add.$/ ) {
 		$reading = "$lemma $1";
+	} elsif( $reading =~ /add. alia manu/ ) {
+		# Ignore it.
+		$reading = $lemma;
 	} elsif( $reading eq 'om.' 
 	    || $reading =~ /locus [uv]acuus/
 	    || $reading =~ /inscriptionem compegi e/ # TODO huh?
 	    || $reading eq 'def.' # TODO huh?
 	    ) {
 		$reading = '';
-	} elsif( $reading eq 'inv.' ) {
+	} elsif( $reading =~ /^in[uv]\.$/ ) {
 		# Hope it is two words.
 		print STDERR "WARNING: want to invert a lemma that is not two words\n" 
 			unless scalar( @words ) == 2;
 		$reading = join( ' ', reverse( @words ) );
-	} elsif( $reading eq 'iter.' ) {
+	} elsif( $reading =~ /^iter(\.|at)$/ ) {
 		# Repeat the lemma
 		$reading = "$lemma $lemma";
-	 } elsif( $reading =~ /^(.*) \.\.\. (.*)$/ ) {
+	} elsif( $reading eq 'in marg.' ) {
+		# There was nothing before a correction.
+		$reading = '';
+		$flag = '_ac';
+	} elsif( $reading =~ /^(.*) \.\.\. (.*)$/ ) {
 		# The first and last N words captured should replace the first and
 		# last N words of the lemma.
 		my @begin = split( /\s+/, $1 );
@@ -267,8 +299,12 @@ sub interpret {
 			$reading = join( ' ', @words );
 		}
 	}
-	print STDERR "Interpreted $oldreading as $reading given $lemma\n";
-	return $reading;
+	if( $oldreading ne $reading || $flag || $oldreading =~ /\./ ) {
+		my $int = $reading;
+		$int .= " ($flag)" if $flag;
+		print STDERR "Interpreted $oldreading as $int given $lemma\n";
+	}
+	return( $reading, $flag );
 }
 
 sub _parse_wit_detail {
@@ -305,22 +341,22 @@ sub expand_all_paths {
     # Walk the collation and fish out the paths for each witness
     foreach my $wit ( $c->tradition->witnesses ) {
         my $sig = $wit->sigil;
-        my @path = grep { $_->name !~ /(APP|ANCHOR)/ } 
+        my @path = grep { !$_->is_ph } 
             $c->reading_sequence( $c->start, $c->end, $sig );
         $wit->path( \@path );
         if( $has_ac{$sig} ) {
-            my @ac_path = grep { $_->name !~ /(APP|ANCHOR)/ } 
+            my @ac_path = grep { !$_->is_ph } 
                 $c->reading_sequence( $c->start, $c->end, $sig.$c->ac_label, $sig );
             $wit->uncorrected_path( \@ac_path );
         }
     }   
     
     # Delete the anchors
-    foreach my $anchor ( grep { $_->name =~ /(APP|ANCHOR)/ } $c->readings ) {
+    foreach my $anchor ( grep { $_->is_ph } $c->readings ) {
         $c->del_reading( $anchor );
     }
-    # Delete all edges
-    map { $c->del_path( $_ ) } $c->paths;
+    # Delete the base edges
+    map { $c->del_path( $_, $c->baselabel ) } $c->paths;
     
     # Make the path edges
     $c->make_witness_paths();