[urisagit/File-ReadBackwards.git] / ReadBackwards.pm

# File::ReadBackwards.pm

# Copyright (C) 2003 by Uri Guttman. All rights reserved.
# mail bugs, comments and feedback to uri@stemsystems.com

package File::ReadBackwards ;

use strict ;

use vars qw( $VERSION ) ;

$VERSION = '1.04' ;

use Symbol ;
use Fcntl qw( :seek O_RDONLY ) ;
use Carp ;

my $max_read_size = 1 << 13 ;

my $default_rec_sep ;

BEGIN {

# set the default record separator according to this OS
# this needs testing and expansion.

# look for CR/LF types
# then look for CR types
# else it's a LF type

	if ( $^O =~ /win32/i || $^O =~ /vms/i ) {

		$default_rec_sep = "\015\012" ;
	}
	elsif ( $^O =~ /mac/i ) {

		$default_rec_sep = "\015" ;
	}
	else {
		$default_rec_sep = "\012" ;
	}

# the tied interface is exactly the same as the object one, so all we
# need to do is to alias the subs with typeglobs

	*TIEHANDLE = \&new ;
	*READLINE = \&readline ;
 	*EOF = \&eof ;
 	*CLOSE = \&close ;
 	*TELL = \&tell ;

# added getline alias for compatibility with IO::Handle

	*getline = \&readline ;
}


# constructor for File::ReadBackwards

sub new {

	my( $class, $filename, $rec_sep, $sep_is_regex ) = @_ ;

# check that we have a filename

	defined( $filename ) || return ;

# see if this file uses the default of a cr/lf separator
# those files will get cr/lf converted to \n

	$rec_sep ||= $default_rec_sep ;
	my $is_crlf = $rec_sep eq "\015\012" ;

# get a handle and open the file

	my $handle = gensym ;
	sysopen( $handle, $filename, O_RDONLY ) || return ;
	binmode $handle ;

# seek to the end of the file and get its size

	my $seek_pos = sysseek( $handle, 0, SEEK_END ) or return ;

# get the size of the first block to read,
# either a trailing partial one (the % size) or full sized one (max read size)

	my $read_size = $seek_pos % $max_read_size || $max_read_size ;

# create the object

	my $self = bless {
			'file_name'	=> $filename,
			'handle'	=> $handle,
			'read_size'	=> $read_size,
			'seek_pos'	=> $seek_pos,
			'lines'		=> [],
			'is_crlf'	=> $is_crlf,
			'rec_sep'	=> $rec_sep,
			'sep_is_regex'	=> $sep_is_regex,

		}, $class ;

	return( $self ) ;
}

# read the previous record from the file
# 
sub readline {

	my( $self, $line_ref ) = @_ ;

	my $read_buf ;

# get the buffer of lines

	my $lines_ref = $self->{'lines'} ;

	return unless $lines_ref ;

	while( 1 ) {

# see if there is more than 1 line in the buffer

		if ( @{$lines_ref} > 1 ) {

# we have a complete line so return it
# and convert those damned cr/lf lines to \n

			$lines_ref->[-1] =~ s/\015\012/\n/
					if $self->{'is_crlf'} ;

			return( pop @{$lines_ref} ) ;
		}

# we don't have a complete, so have to read blocks until we do

		my $seek_pos = $self->{'seek_pos'} ;

# see if we are at the beginning of the file

		if ( $seek_pos == 0 ) {

# the last read never made more lines, so return the last line in the buffer
# if no lines left then undef will be returned
# and convert those damned cr/lf lines to \n

			$lines_ref->[-1] =~ s/\015\012/\n/
					if @{$lines_ref} && $self->{'is_crlf'} ;

			return( pop @{$lines_ref} ) ;
		}

# we have to read more text so get the handle and the current read size

		my $handle = $self->{'handle'} ;
		my $read_size = $self->{'read_size'} ;

# after the first read, always read the maximum size

		$self->{'read_size'} = $max_read_size ;

# seek to the beginning of this block and save the new seek position

		$seek_pos -= $read_size ;
		sysseek( $handle, $seek_pos, SEEK_SET ) ;
		$self->{'seek_pos'} = $seek_pos ;

# read in the next (previous) block of text

		my $read_cnt = sysread( $handle, $read_buf, $read_size ) ;

# prepend the read buffer to the leftover (possibly partial) line

		my $text = $read_buf ;
		$text .= shift @{$lines_ref} if @{$lines_ref} ;

# split the buffer into a list of lines
# this may want to be $/ but reading files backwards assumes plain text and
# newline separators

		@{$lines_ref} = ( $self->{'sep_is_regex'} ) ?
	 		$text =~ /(.*?$self->{'rec_sep'}|.+)/gs :
			$text =~ /(.*?\Q$self->{'rec_sep'}\E|.+)/gs ;

#print "Lines \n=>", join( "<=\n=>", @{$lines_ref} ), "<=\n" ;

	}
}

sub eof {

	my ( $self ) = @_ ;

	my $seek_pos = $self->{'seek_pos'} ;
	my $lines_count = @{ $self->{'lines'} } ;
	return( $seek_pos == 0 && $lines_count == 0 ) ;
}

sub tell {
	my ( $self ) = @_ ;

	my $seek_pos = $self->{'seek_pos'} ;
	$seek_pos + length(join "", @{ $self->{'lines'} });
}

sub get_handle {
	my ( $self ) = @_ ;

	my $handle = $self->{handle} ;
	seek( $handle, $self->tell, SEEK_SET ) ;
	return $handle ;
}

sub close {

	my ( $self ) = @_ ;

	my $handle = delete( $self->{'handle'} ) ;
	delete( $self->{'lines'} ) ;

	CORE::close( $handle ) ;
}

__END__


=head1 NAME

File::ReadBackwards.pm -- Read a file backwards by lines.
 

=head1 SYNOPSIS

    use File::ReadBackwards ;

    # Object interface

    $bw = File::ReadBackwards->new( 'log_file' ) or
			die "can't read 'log_file' $!" ;

    while( defined( $log_line = $bw->readline ) ) {
	    print $log_line ;
    }

    # ... or the alternative way of reading

    until ( $bw->eof ) {
	    print $bw->readline ;
    }

    # Tied Handle Interface

    tie *BW, 'File::ReadBackwards', 'log_file' or
			die "can't read 'log_file' $!" ;

    while( <BW> ) {
	    print ;
    }

=head1 DESCRIPTION
  

This module reads a file backwards line by line. It is simple to use,
memory efficient and fast. It supports both an object and a tied handle
interface.

It is intended for processing log and other similar text files which
typically have their newest entries appended to them. By default files
are assumed to be plain text and have a line ending appropriate to the
OS. But you can set the input record separator string on a per file
basis.


=head1 OBJECT INTERFACE
 
These are the methods in C<File::ReadBackwards>' object interface:


=head2 new( $file, [$rec_sep], [$sep_is_regex] )

C<new> takes as arguments a filename, an optional record separator and
an optional flag that marks the record separator as a regular
expression. It either returns the object on a successful open or undef
upon failure. $! is set to the error code if any.

=head2 readline

C<readline> takes no arguments and it returns the previous line in the
file or undef when there are no more lines in the file. If the file is
a non-seekable file (e.g. a pipe), then undef is returned.

=head2 getline

C<getline> is an alias for the readline method. It is here for
compatibilty with the IO::* classes which has a getline method.

=head2 eof

C<eof> takes no arguments and it returns true when readline() has
iterated through the whole file.

=head2 close

C<close> takes no arguments and it closes the handle

=head2 tell

C<tell> takes no arguments and it returns the current filehandle position.
This value may be used to seek() back to this position using a normal
file handle.

=head2 get_handle

C<get_handle> takes no arguments and it returns the internal Perl
filehandle used by the File::ReadBackwards object.  This handle may be
used to read the file forward. Its seek position will be set to the
position that is returned by the tell() method.  Note that
interleaving forward and reverse reads may produce unpredictable
results.  The only use supported at present is to read a file backward
to a certain point, then use 'handle' to extract the handle, and read
forward from that point.

=head1 TIED HANDLE INTERFACE

=head2 tie( *HANDLE, 'File::ReadBackwards', $file, [$rec_sep], [$sep_is_regex] )
 

The TIEHANDLE, READLINE, EOF, CLOSE and TELL methods are aliased to
the new, readline, eof, close and tell methods respectively so refer
to them for their arguments and API.  Once you have tied a handle to
File::ReadBackwards the only I/O operation permissible is <> which
will read the previous line. You can call eof() and close() on the
tied handle as well. All other tied handle operations will generate an
unknown method error. Do not seek, write or perform any other
unsupported operations on the tied handle.

=head1 LINE AND RECORD ENDINGS
 

Since this module needs to use low level I/O for efficiency, it can't
portably seek and do block I/O without managing line ending conversions.
This module supports the default record separators of normal line ending
strings used by the OS. You can also set the separator on a per file
basis.

The record separator is a regular expression by default, which differs
from the behavior of $/.

Only if the record separator is B<not> specified and it defaults to
CR/LF (e.g, VMS, redmondware) will it will be converted to a single
newline. Unix and MacOS files systems use only a single character for
line endings and the lines are left unchanged.  This means that for
native text files, you should be able to process their lines backwards
without any problems with line endings. If you specify a record
separator, no conversions will be done and you will get the records as
if you read them in binary mode.

=head1 DESIGN

It works by reading a large (8kb) block of data from the end of the
file.  It then splits them on the record separator and stores a list of
records in the object. Each call to readline returns the top record of
the list and if the list is empty it refills it by reading the previous
block from the file and splitting it.  When the beginning of the file is
reached and there are no more lines, undef is returned.  All boundary
conditions are handled correctly i.e. if there is a trailing partial
line (no newline) it will be the first line returned and lines larger
than the read buffer size are handled properly.


=head1 NOTES
 

There is no support for list context in either the object or tied
interfaces. If you want to slurp all of the lines into an array in
backwards order (and you don't care about memory usage) just do:

	@back_lines = reverse <FH>.

This module is only intended to read one line at a time from the end of
a file to the beginning.

=head1 AUTHOR
 

Uri Guttman, uri@stemsystems.com

=head1 COPYRIGHT
 

Copyright (C) 2003 by Uri Guttman. All rights reserved.  This program is
free software; you can redistribute it and/or modify it under the same
terms as Perl itself.

=cut
Commit	Line	Data
0308c4d5	1	# File::ReadBackwards.pm
	2
	3	# Copyright (C) 2003 by Uri Guttman. All rights reserved.
	4	# mail bugs, comments and feedback to uri@stemsystems.com
	5
	6	package File::ReadBackwards ;
	7
	8	use strict ;
	9
	10	use vars qw( $VERSION ) ;
	11
	12	$VERSION = '1.04' ;
	13
	14	use Symbol ;
	15	use Fcntl qw( :seek O_RDONLY ) ;
	16	use Carp ;
	17
	18	my $max_read_size = 1 << 13 ;
	19
	20	my $default_rec_sep ;
	21
	22	BEGIN {
	23
	24	# set the default record separator according to this OS
	25	# this needs testing and expansion.
	26
	27	# look for CR/LF types
	28	# then look for CR types
	29	# else it's a LF type
	30
	31	if ( $^O =~ /win32/i \|\| $^O =~ /vms/i ) {
	32
	33	$default_rec_sep = "\015\012" ;
	34	}
	35	elsif ( $^O =~ /mac/i ) {
	36
	37	$default_rec_sep = "\015" ;
	38	}
	39	else {
	40	$default_rec_sep = "\012" ;
	41	}
	42
	43	# the tied interface is exactly the same as the object one, so all we
	44	# need to do is to alias the subs with typeglobs
	45
	46	*TIEHANDLE = \&new ;
	47	*READLINE = \&readline ;
	48	*EOF = \&eof ;
	49	*CLOSE = \&close ;
	50	*TELL = \&tell ;
	51
	52	# added getline alias for compatibility with IO::Handle
	53
	54	*getline = \&readline ;
	55	}
	56
	57
	58	# constructor for File::ReadBackwards
	59
	60	sub new {
	61
	62	my( $class, $filename, $rec_sep, $sep_is_regex ) = @_ ;
	63
	64	# check that we have a filename
65
66	defined( $filename ) \|\| return ;
67
68	# see if this file uses the default of a cr/lf separator
69	# those files will get cr/lf converted to \n
70
71	$rec_sep \|\|= $default_rec_sep ;
72	my $is_crlf = $rec_sep eq "\015\012" ;
73
74	# get a handle and open the file
75
76	my $handle = gensym ;
77	sysopen( $handle, $filename, O_RDONLY ) \|\| return ;
78	binmode $handle ;
79
80	# seek to the end of the file and get its size
81
82	my $seek_pos = sysseek( $handle, 0, SEEK_END ) or return ;
83
84	# get the size of the first block to read,
85	# either a trailing partial one (the % size) or full sized one (max read size)
86
87	my $read_size = $seek_pos % $max_read_size \|\| $max_read_size ;
88
89	# create the object
90
91	my $self = bless {
92	'file_name' => $filename,
93	'handle' => $handle,
94	'read_size' => $read_size,
95	'seek_pos' => $seek_pos,
96	'lines' => [],
97	'is_crlf' => $is_crlf,
98	'rec_sep' => $rec_sep,
99	'sep_is_regex' => $sep_is_regex,
100
101	}, $class ;
102
103	return( $self ) ;
104	}
105
106	# read the previous record from the file
107	#
108	sub readline {
109
110	my( $self, $line_ref ) = @_ ;
111
112	my $read_buf ;
113
114	# get the buffer of lines
115
116	my $lines_ref = $self->{'lines'} ;
117
118	return unless $lines_ref ;
119
120	while( 1 ) {
121
122	# see if there is more than 1 line in the buffer
123
124	if ( @{$lines_ref} > 1 ) {
125
126	# we have a complete line so return it
127	# and convert those damned cr/lf lines to \n
128
129	$lines_ref->[-1] =~ s/\015\012/\n/
130	if $self->{'is_crlf'} ;
131
132	return( pop @{$lines_ref} ) ;
133	}
134
135	# we don't have a complete, so have to read blocks until we do
136
137	my $seek_pos = $self->{'seek_pos'} ;
138
139	# see if we are at the beginning of the file
140
141	if ( $seek_pos == 0 ) {
142
143	# the last read never made more lines, so return the last line in the buffer
144	# if no lines left then undef will be returned
145	# and convert those damned cr/lf lines to \n
146
147	$lines_ref->[-1] =~ s/\015\012/\n/
148	if @{$lines_ref} && $self->{'is_crlf'} ;
149
150	return( pop @{$lines_ref} ) ;
151	}
152
153	# we have to read more text so get the handle and the current read size
154
155	my $handle = $self->{'handle'} ;
156	my $read_size = $self->{'read_size'} ;
157
158	# after the first read, always read the maximum size
159
160	$self->{'read_size'} = $max_read_size ;
161
162	# seek to the beginning of this block and save the new seek position
163
164	$seek_pos -= $read_size ;
165	sysseek( $handle, $seek_pos, SEEK_SET ) ;
166	$self->{'seek_pos'} = $seek_pos ;
167
168	# read in the next (previous) block of text
169
170	my $read_cnt = sysread( $handle, $read_buf, $read_size ) ;
171
172	# prepend the read buffer to the leftover (possibly partial) line
173
174	my $text = $read_buf ;
175	$text .= shift @{$lines_ref} if @{$lines_ref} ;
176
177	# split the buffer into a list of lines
178	# this may want to be $/ but reading files backwards assumes plain text and
179	# newline separators
180
181	@{$lines_ref} = ( $self->{'sep_is_regex'} ) ?
182	$text =~ /(.*?$self->{'rec_sep'}\|.+)/gs :
183	$text =~ /(.*?\Q$self->{'rec_sep'}\E\|.+)/gs ;
184
185	#print "Lines \n=>", join( "<=\n=>", @{$lines_ref} ), "<=\n" ;
186
187	}
188	}
189
190	sub eof {
191
192	my ( $self ) = @_ ;
193
194	my $seek_pos = $self->{'seek_pos'} ;
195	my $lines_count = @{ $self->{'lines'} } ;
196	return( $seek_pos == 0 && $lines_count == 0 ) ;
197	}
198
199	sub tell {
200	my ( $self ) = @_ ;
201
202	my $seek_pos = $self->{'seek_pos'} ;
203	$seek_pos + length(join "", @{ $self->{'lines'} });
204	}
205
206	sub get_handle {
207	my ( $self ) = @_ ;
208
209	my $handle = $self->{handle} ;
210	seek( $handle, $self->tell, SEEK_SET ) ;
211	return $handle ;
212	}
213
214	sub close {
215
216	my ( $self ) = @_ ;
217
218	my $handle = delete( $self->{'handle'} ) ;
219	delete( $self->{'lines'} ) ;
220
221	CORE::close( $handle ) ;
222	}
223
224	__END__
225
226
227	=head1 NAME
228
229	File::ReadBackwards.pm -- Read a file backwards by lines.
230
231
232	=head1 SYNOPSIS
233
234	use File::ReadBackwards ;
235
236	# Object interface
237
238	$bw = File::ReadBackwards->new( 'log_file' ) or
239	die "can't read 'log_file' $!" ;
240
241	while( defined( $log_line = $bw->readline ) ) {
242	print $log_line ;
243	}
244
245	# ... or the alternative way of reading
246
247	until ( $bw->eof ) {
248	print $bw->readline ;
249	}
250
251	# Tied Handle Interface
252
253	tie *BW, 'File::ReadBackwards', 'log_file' or
254	die "can't read 'log_file' $!" ;
255
256	while( <BW> ) {
257	print ;
258	}
259
260	=head1 DESCRIPTION
261
262
263	This module reads a file backwards line by line. It is simple to use,
264	memory efficient and fast. It supports both an object and a tied handle
265	interface.
266
267	It is intended for processing log and other similar text files which
268	typically have their newest entries appended to them. By default files
269	are assumed to be plain text and have a line ending appropriate to the
270	OS. But you can set the input record separator string on a per file
271	basis.
272
273
274	=head1 OBJECT INTERFACE
275
276	These are the methods in C<File::ReadBackwards>' object interface:
277
278
279	=head2 new( $file, [$rec_sep], [$sep_is_regex] )
280
281	C<new> takes as arguments a filename, an optional record separator and
282	an optional flag that marks the record separator as a regular
283	expression. It either returns the object on a successful open or undef
284	upon failure. $! is set to the error code if any.
285
286	=head2 readline
287
288	C<readline> takes no arguments and it returns the previous line in the
289	file or undef when there are no more lines in the file. If the file is
290	a non-seekable file (e.g. a pipe), then undef is returned.
291
292	=head2 getline
293
294	C<getline> is an alias for the readline method. It is here for
295	compatibilty with the IO::* classes which has a getline method.
296
297	=head2 eof
298
299	C<eof> takes no arguments and it returns true when readline() has
300	iterated through the whole file.
301
302	=head2 close
303
304	C<close> takes no arguments and it closes the handle
305
306	=head2 tell
307
308	C<tell> takes no arguments and it returns the current filehandle position.
309	This value may be used to seek() back to this position using a normal
310	file handle.
311
312	=head2 get_handle
313
314	C<get_handle> takes no arguments and it returns the internal Perl
315	filehandle used by the File::ReadBackwards object. This handle may be
316	used to read the file forward. Its seek position will be set to the
317	position that is returned by the tell() method. Note that
318	interleaving forward and reverse reads may produce unpredictable
319	results. The only use supported at present is to read a file backward
320	to a certain point, then use 'handle' to extract the handle, and read
321	forward from that point.
322
323	=head1 TIED HANDLE INTERFACE
324
325	=head2 tie( *HANDLE, 'File::ReadBackwards', $file, [$rec_sep], [$sep_is_regex] )
326
327
328	The TIEHANDLE, READLINE, EOF, CLOSE and TELL methods are aliased to
329	the new, readline, eof, close and tell methods respectively so refer
330	to them for their arguments and API. Once you have tied a handle to
331	File::ReadBackwards the only I/O operation permissible is <> which
332	will read the previous line. You can call eof() and close() on the
333	tied handle as well. All other tied handle operations will generate an
334	unknown method error. Do not seek, write or perform any other
335	unsupported operations on the tied handle.
336
337	=head1 LINE AND RECORD ENDINGS
338
339
340	Since this module needs to use low level I/O for efficiency, it can't
341	portably seek and do block I/O without managing line ending conversions.
342	This module supports the default record separators of normal line ending
343	strings used by the OS. You can also set the separator on a per file
344	basis.
345
346	The record separator is a regular expression by default, which differs
347	from the behavior of $/.
348
349	Only if the record separator is B<not> specified and it defaults to
350	CR/LF (e.g, VMS, redmondware) will it will be converted to a single
351	newline. Unix and MacOS files systems use only a single character for
352	line endings and the lines are left unchanged. This means that for
353	native text files, you should be able to process their lines backwards
354	without any problems with line endings. If you specify a record
355	separator, no conversions will be done and you will get the records as
356	if you read them in binary mode.
357
358	=head1 DESIGN
359
360	It works by reading a large (8kb) block of data from the end of the
361	file. It then splits them on the record separator and stores a list of
362	records in the object. Each call to readline returns the top record of
363	the list and if the list is empty it refills it by reading the previous
364	block from the file and splitting it. When the beginning of the file is
365	reached and there are no more lines, undef is returned. All boundary
366	conditions are handled correctly i.e. if there is a trailing partial
367	line (no newline) it will be the first line returned and lines larger
368	than the read buffer size are handled properly.
369
370
371	=head1 NOTES
372
373
374	There is no support for list context in either the object or tied
375	interfaces. If you want to slurp all of the lines into an array in
376	backwards order (and you don't care about memory usage) just do:
377
378	@back_lines = reverse <FH>.
379
380	This module is only intended to read one line at a time from the end of
381	a file to the beginning.
382
383	=head1 AUTHOR
384
385
386	Uri Guttman, uri@stemsystems.com
387
388	=head1 COPYRIGHT
389
390
391	Copyright (C) 2003 by Uri Guttman. All rights reserved. This program is
392	free software; you can redistribute it and/or modify it under the same
393	terms as Perl itself.
394
395	=cut