Commit | Line | Data |
0308c4d5 |
1 | # File::ReadBackwards.pm |
2 | |
3 | # Copyright (C) 2003 by Uri Guttman. All rights reserved. |
4 | # mail bugs, comments and feedback to uri@stemsystems.com |
5 | |
6 | package File::ReadBackwards ; |
7 | |
8 | use strict ; |
9 | |
10 | use vars qw( $VERSION ) ; |
11 | |
12 | $VERSION = '1.04' ; |
13 | |
14 | use Symbol ; |
15 | use Fcntl qw( :seek O_RDONLY ) ; |
16 | use Carp ; |
17 | |
18 | my $max_read_size = 1 << 13 ; |
19 | |
20 | my $default_rec_sep ; |
21 | |
22 | BEGIN { |
23 | |
24 | # set the default record separator according to this OS |
25 | # this needs testing and expansion. |
26 | |
27 | # look for CR/LF types |
28 | # then look for CR types |
29 | # else it's a LF type |
30 | |
31 | if ( $^O =~ /win32/i || $^O =~ /vms/i ) { |
32 | |
33 | $default_rec_sep = "\015\012" ; |
34 | } |
35 | elsif ( $^O =~ /mac/i ) { |
36 | |
37 | $default_rec_sep = "\015" ; |
38 | } |
39 | else { |
40 | $default_rec_sep = "\012" ; |
41 | } |
42 | |
43 | # the tied interface is exactly the same as the object one, so all we |
44 | # need to do is to alias the subs with typeglobs |
45 | |
46 | *TIEHANDLE = \&new ; |
47 | *READLINE = \&readline ; |
48 | *EOF = \&eof ; |
49 | *CLOSE = \&close ; |
50 | *TELL = \&tell ; |
51 | |
52 | # added getline alias for compatibility with IO::Handle |
53 | |
54 | *getline = \&readline ; |
55 | } |
56 | |
57 | |
58 | # constructor for File::ReadBackwards |
59 | |
60 | sub new { |
61 | |
62 | my( $class, $filename, $rec_sep, $sep_is_regex ) = @_ ; |
63 | |
64 | # check that we have a filename |
65 | |
66 | defined( $filename ) || return ; |
67 | |
68 | # see if this file uses the default of a cr/lf separator |
69 | # those files will get cr/lf converted to \n |
70 | |
71 | $rec_sep ||= $default_rec_sep ; |
72 | my $is_crlf = $rec_sep eq "\015\012" ; |
73 | |
74 | # get a handle and open the file |
75 | |
76 | my $handle = gensym ; |
77 | sysopen( $handle, $filename, O_RDONLY ) || return ; |
78 | binmode $handle ; |
79 | |
80 | # seek to the end of the file and get its size |
81 | |
82 | my $seek_pos = sysseek( $handle, 0, SEEK_END ) or return ; |
83 | |
84 | # get the size of the first block to read, |
85 | # either a trailing partial one (the % size) or full sized one (max read size) |
86 | |
87 | my $read_size = $seek_pos % $max_read_size || $max_read_size ; |
88 | |
89 | # create the object |
90 | |
91 | my $self = bless { |
92 | 'file_name' => $filename, |
93 | 'handle' => $handle, |
94 | 'read_size' => $read_size, |
95 | 'seek_pos' => $seek_pos, |
96 | 'lines' => [], |
97 | 'is_crlf' => $is_crlf, |
98 | 'rec_sep' => $rec_sep, |
99 | 'sep_is_regex' => $sep_is_regex, |
100 | |
101 | }, $class ; |
102 | |
103 | return( $self ) ; |
104 | } |
105 | |
106 | # read the previous record from the file |
107 | # |
108 | sub readline { |
109 | |
110 | my( $self, $line_ref ) = @_ ; |
111 | |
112 | my $read_buf ; |
113 | |
114 | # get the buffer of lines |
115 | |
116 | my $lines_ref = $self->{'lines'} ; |
117 | |
118 | return unless $lines_ref ; |
119 | |
120 | while( 1 ) { |
121 | |
122 | # see if there is more than 1 line in the buffer |
123 | |
124 | if ( @{$lines_ref} > 1 ) { |
125 | |
126 | # we have a complete line so return it |
127 | # and convert those damned cr/lf lines to \n |
128 | |
129 | $lines_ref->[-1] =~ s/\015\012/\n/ |
130 | if $self->{'is_crlf'} ; |
131 | |
132 | return( pop @{$lines_ref} ) ; |
133 | } |
134 | |
135 | # we don't have a complete, so have to read blocks until we do |
136 | |
137 | my $seek_pos = $self->{'seek_pos'} ; |
138 | |
139 | # see if we are at the beginning of the file |
140 | |
141 | if ( $seek_pos == 0 ) { |
142 | |
143 | # the last read never made more lines, so return the last line in the buffer |
144 | # if no lines left then undef will be returned |
145 | # and convert those damned cr/lf lines to \n |
146 | |
147 | $lines_ref->[-1] =~ s/\015\012/\n/ |
148 | if @{$lines_ref} && $self->{'is_crlf'} ; |
149 | |
150 | return( pop @{$lines_ref} ) ; |
151 | } |
152 | |
153 | # we have to read more text so get the handle and the current read size |
154 | |
155 | my $handle = $self->{'handle'} ; |
156 | my $read_size = $self->{'read_size'} ; |
157 | |
158 | # after the first read, always read the maximum size |
159 | |
160 | $self->{'read_size'} = $max_read_size ; |
161 | |
162 | # seek to the beginning of this block and save the new seek position |
163 | |
164 | $seek_pos -= $read_size ; |
165 | sysseek( $handle, $seek_pos, SEEK_SET ) ; |
166 | $self->{'seek_pos'} = $seek_pos ; |
167 | |
168 | # read in the next (previous) block of text |
169 | |
170 | my $read_cnt = sysread( $handle, $read_buf, $read_size ) ; |
171 | |
172 | # prepend the read buffer to the leftover (possibly partial) line |
173 | |
174 | my $text = $read_buf ; |
175 | $text .= shift @{$lines_ref} if @{$lines_ref} ; |
176 | |
177 | # split the buffer into a list of lines |
178 | # this may want to be $/ but reading files backwards assumes plain text and |
179 | # newline separators |
180 | |
181 | @{$lines_ref} = ( $self->{'sep_is_regex'} ) ? |
182 | $text =~ /(.*?$self->{'rec_sep'}|.+)/gs : |
183 | $text =~ /(.*?\Q$self->{'rec_sep'}\E|.+)/gs ; |
184 | |
185 | #print "Lines \n=>", join( "<=\n=>", @{$lines_ref} ), "<=\n" ; |
186 | |
187 | } |
188 | } |
189 | |
190 | sub eof { |
191 | |
192 | my ( $self ) = @_ ; |
193 | |
194 | my $seek_pos = $self->{'seek_pos'} ; |
195 | my $lines_count = @{ $self->{'lines'} } ; |
196 | return( $seek_pos == 0 && $lines_count == 0 ) ; |
197 | } |
198 | |
199 | sub tell { |
200 | my ( $self ) = @_ ; |
201 | |
202 | my $seek_pos = $self->{'seek_pos'} ; |
203 | $seek_pos + length(join "", @{ $self->{'lines'} }); |
204 | } |
205 | |
206 | sub get_handle { |
207 | my ( $self ) = @_ ; |
208 | |
209 | my $handle = $self->{handle} ; |
210 | seek( $handle, $self->tell, SEEK_SET ) ; |
211 | return $handle ; |
212 | } |
213 | |
214 | sub close { |
215 | |
216 | my ( $self ) = @_ ; |
217 | |
218 | my $handle = delete( $self->{'handle'} ) ; |
219 | delete( $self->{'lines'} ) ; |
220 | |
221 | CORE::close( $handle ) ; |
222 | } |
223 | |
224 | __END__ |
225 | |
226 | |
227 | =head1 NAME |
228 | |
229 | File::ReadBackwards.pm -- Read a file backwards by lines. |
230 | |
231 | |
232 | =head1 SYNOPSIS |
233 | |
234 | use File::ReadBackwards ; |
235 | |
236 | # Object interface |
237 | |
238 | $bw = File::ReadBackwards->new( 'log_file' ) or |
239 | die "can't read 'log_file' $!" ; |
240 | |
241 | while( defined( $log_line = $bw->readline ) ) { |
242 | print $log_line ; |
243 | } |
244 | |
245 | # ... or the alternative way of reading |
246 | |
247 | until ( $bw->eof ) { |
248 | print $bw->readline ; |
249 | } |
250 | |
251 | # Tied Handle Interface |
252 | |
253 | tie *BW, 'File::ReadBackwards', 'log_file' or |
254 | die "can't read 'log_file' $!" ; |
255 | |
256 | while( <BW> ) { |
257 | print ; |
258 | } |
259 | |
260 | =head1 DESCRIPTION |
261 | |
262 | |
263 | This module reads a file backwards line by line. It is simple to use, |
264 | memory efficient and fast. It supports both an object and a tied handle |
265 | interface. |
266 | |
267 | It is intended for processing log and other similar text files which |
268 | typically have their newest entries appended to them. By default files |
269 | are assumed to be plain text and have a line ending appropriate to the |
270 | OS. But you can set the input record separator string on a per file |
271 | basis. |
272 | |
273 | |
274 | =head1 OBJECT INTERFACE |
275 | |
276 | These are the methods in C<File::ReadBackwards>' object interface: |
277 | |
278 | |
279 | =head2 new( $file, [$rec_sep], [$sep_is_regex] ) |
280 | |
281 | C<new> takes as arguments a filename, an optional record separator and |
282 | an optional flag that marks the record separator as a regular |
283 | expression. It either returns the object on a successful open or undef |
284 | upon failure. $! is set to the error code if any. |
285 | |
286 | =head2 readline |
287 | |
288 | C<readline> takes no arguments and it returns the previous line in the |
289 | file or undef when there are no more lines in the file. If the file is |
290 | a non-seekable file (e.g. a pipe), then undef is returned. |
291 | |
292 | =head2 getline |
293 | |
294 | C<getline> is an alias for the readline method. It is here for |
295 | compatibilty with the IO::* classes which has a getline method. |
296 | |
297 | =head2 eof |
298 | |
299 | C<eof> takes no arguments and it returns true when readline() has |
300 | iterated through the whole file. |
301 | |
302 | =head2 close |
303 | |
304 | C<close> takes no arguments and it closes the handle |
305 | |
306 | =head2 tell |
307 | |
308 | C<tell> takes no arguments and it returns the current filehandle position. |
309 | This value may be used to seek() back to this position using a normal |
310 | file handle. |
311 | |
312 | =head2 get_handle |
313 | |
314 | C<get_handle> takes no arguments and it returns the internal Perl |
315 | filehandle used by the File::ReadBackwards object. This handle may be |
316 | used to read the file forward. Its seek position will be set to the |
317 | position that is returned by the tell() method. Note that |
318 | interleaving forward and reverse reads may produce unpredictable |
319 | results. The only use supported at present is to read a file backward |
320 | to a certain point, then use 'handle' to extract the handle, and read |
321 | forward from that point. |
322 | |
323 | =head1 TIED HANDLE INTERFACE |
324 | |
325 | =head2 tie( *HANDLE, 'File::ReadBackwards', $file, [$rec_sep], [$sep_is_regex] ) |
326 | |
327 | |
328 | The TIEHANDLE, READLINE, EOF, CLOSE and TELL methods are aliased to |
329 | the new, readline, eof, close and tell methods respectively so refer |
330 | to them for their arguments and API. Once you have tied a handle to |
331 | File::ReadBackwards the only I/O operation permissible is <> which |
332 | will read the previous line. You can call eof() and close() on the |
333 | tied handle as well. All other tied handle operations will generate an |
334 | unknown method error. Do not seek, write or perform any other |
335 | unsupported operations on the tied handle. |
336 | |
337 | =head1 LINE AND RECORD ENDINGS |
338 | |
339 | |
340 | Since this module needs to use low level I/O for efficiency, it can't |
341 | portably seek and do block I/O without managing line ending conversions. |
342 | This module supports the default record separators of normal line ending |
343 | strings used by the OS. You can also set the separator on a per file |
344 | basis. |
345 | |
346 | The record separator is a regular expression by default, which differs |
347 | from the behavior of $/. |
348 | |
349 | Only if the record separator is B<not> specified and it defaults to |
350 | CR/LF (e.g, VMS, redmondware) will it will be converted to a single |
351 | newline. Unix and MacOS files systems use only a single character for |
352 | line endings and the lines are left unchanged. This means that for |
353 | native text files, you should be able to process their lines backwards |
354 | without any problems with line endings. If you specify a record |
355 | separator, no conversions will be done and you will get the records as |
356 | if you read them in binary mode. |
357 | |
358 | =head1 DESIGN |
359 | |
360 | It works by reading a large (8kb) block of data from the end of the |
361 | file. It then splits them on the record separator and stores a list of |
362 | records in the object. Each call to readline returns the top record of |
363 | the list and if the list is empty it refills it by reading the previous |
364 | block from the file and splitting it. When the beginning of the file is |
365 | reached and there are no more lines, undef is returned. All boundary |
366 | conditions are handled correctly i.e. if there is a trailing partial |
367 | line (no newline) it will be the first line returned and lines larger |
368 | than the read buffer size are handled properly. |
369 | |
370 | |
371 | =head1 NOTES |
372 | |
373 | |
374 | There is no support for list context in either the object or tied |
375 | interfaces. If you want to slurp all of the lines into an array in |
376 | backwards order (and you don't care about memory usage) just do: |
377 | |
378 | @back_lines = reverse <FH>. |
379 | |
380 | This module is only intended to read one line at a time from the end of |
381 | a file to the beginning. |
382 | |
383 | =head1 AUTHOR |
384 | |
385 | |
386 | Uri Guttman, uri@stemsystems.com |
387 | |
388 | =head1 COPYRIGHT |
389 | |
390 | |
391 | Copyright (C) 2003 by Uri Guttman. All rights reserved. This program is |
392 | free software; you can redistribute it and/or modify it under the same |
393 | terms as Perl itself. |
394 | |
395 | =cut |