X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FCTE.pm;h=be6adfcdc9c6aa360a0154487831af0f71231b48;hb=222d58f10f00b766849be205554662b2bbcd29f9;hp=af989c7fadedb50d5973f72aa102ccba1042414b;hpb=c9158e60d47ce702857a268db2e337257adf619d;p=scpubgit%2Fstemmatology.git

diff --git a/lib/Text/Tradition/Parser/CTE.pm b/lib/Text/Tradition/Parser/CTE.pm
index af989c7..be6adfc 100644
--- a/lib/Text/Tradition/Parser/CTE.pm
+++ b/lib/Text/Tradition/Parser/CTE.pm
@@ -51,7 +51,7 @@ sub parse {
 		my @sig_parts = $xpc->findnodes( 'descendant::text()', $wit_el );
 		my $sig = _stringify_sigil( @sig_parts );
 		print STDERR "Adding witness $sig\n";
-		$tradition->add_witness( sigil => $sig, source => $wit_el->toString() );
+		$tradition->add_witness( sigil => $sig, sourcetype => 'collation' );
 		$sigil_for{'#'.$id} = $sig;  # Make life easy by keying on the ID ref syntax
 	}
 	
@@ -78,10 +78,10 @@ sub parse {
             $r = $c->add_reading( { id => 'n'.$counter++, 
             						text => $item->{'content'} } );
         } elsif ( $item->{'type'} eq 'anchor' ) {
-            $r = $c->add_reading( { id => '#ANCHOR_' . $item->{'content'} . '#', 
+            $r = $c->add_reading( { id => '__ANCHOR_' . $item->{'content'} . '__', 
             						is_ph => 1 } );
         } elsif ( $item->{'type'} eq 'app' ) {
-            my $tag = '#APP_' . $counter++ . '#';
+            my $tag = '__APP_' . $counter++ . '__';
             $r = $c->add_reading( { id => $tag, is_ph => 1 } );
             $apps{$tag} = $item->{'content'};
         }
@@ -111,7 +111,9 @@ sub parse {
 sub _stringify_sigil {
     my( @nodes ) = @_;
     my @parts = grep { /\w/ } map { $_->data } @nodes;
-    return join( '', @parts );
+    my $whole = join( '', @parts );
+    $whole =~ s/\W//g;
+    return $whole;
 }
 
 # Get rid of all the formatting elements that get in the way of tokenization.
@@ -207,14 +209,14 @@ sub _add_readings {
     # Get the lemma, which is all the readings between app and anchor,
     # excluding other apps or anchors.
     my @lemma = _return_lemma( $c, $app_id, $anchor );
-    my $lemma_str = join( ' ', grep { $_ !~ /^\#/ } map { $_->text } @lemma );
+    my $lemma_str = join( ' ', grep { $_ !~ /^__/ } map { $_->text } @lemma );
     
     # For each reading, send its text to 'interpret' along with the lemma,
     # and then save the list of witnesses that these tokens belong to.
     my %wit_rdgs;  # Maps from witnesses to the variant text
     my $ctr = 0;
     my $tag = $app_id;
-    $tag =~ s/^\#APP_(.*)\#$/$1/;
+    $tag =~ s/^\__APP_(.*)\__$/$1/;
 
     foreach my $rdg ( $xn->getChildrenByTagName( 'rdg' ) ) {
         my @text;
@@ -230,11 +232,11 @@ sub _add_readings {
         
         my @rdg_nodes;
         if( $interpreted eq '#LACUNA#' ) {
-        	push( @rdg_nodes, $c->add_reading( { id => $tag . "/" . $ctr++,
+        	push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++,
         										 is_lacuna => 1 } ) );
         } else {
 			foreach my $w ( split( /\s+/, $interpreted ) ) {
-				my $r = $c->add_reading( { id => $tag . "/" . $ctr++,
+				my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++,
 										   text => $w } );
 				push( @rdg_nodes, $r );
 			}
@@ -259,7 +261,6 @@ sub _add_readings {
     }       
         
     # Now collate the variant readings, since it is not done for us.
-    $DB::single = 1 if @lemma > 10;
     collate_variants( $c, \@lemma, values %wit_rdgs );
         
     # Now add the witness paths for each reading.
@@ -273,12 +274,12 @@ sub _add_readings {
 sub _anchor_name {
     my $xmlid = shift;
     $xmlid =~ s/^\#//;
-    return sprintf( "#ANCHOR_%s#", $xmlid );
+    return sprintf( "__ANCHOR_%s__", $xmlid );
 }
 
 sub _return_lemma {
     my( $c, $app, $anchor ) = @_;
-    my @nodes = grep { $_->id !~ /^\#A(PP|NCHOR)/ } 
+    my @nodes = grep { $_->id !~ /^__A(PP|NCHOR)/ } 
         $c->reading_sequence( $c->reading( $app ), $c->reading( $anchor ),
         	$c->baselabel );
     return @nodes;