From: Uri Guttman Date: Fri, 31 Oct 2008 01:36:20 +0000 (-0500) Subject: initial commit X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=refs%2Fheads%2Fmaster;p=urisagit%2FSort-Maker.git initial commit --- 7468c58403ae8ecc1b2bdefba904be9e8198cfef diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d1f8c03 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +;; This buffer is for notes you don't want to save, and for Lisp evaluation. +;; If you want to create a file, visit that file with C-x C-f, +;; then enter the text in that file's own buffer. + +CVS +*.gz +blib +*.tar +old +*~ + diff --git a/Changes b/Changes new file mode 100644 index 0000000..2f2880e --- /dev/null +++ b/Changes @@ -0,0 +1,58 @@ +Revision history for Perl extension Sort::Maker. + +0.06 Fri Dec 29 00:47:00 EST 2006 + - renamed t/common.pl to t/common.pm + - added support for error tests in t/common.pm + - rewrote t/error.t to use test table and common.pm + - created proper README file + - fixed doc bug (removed 'style =>') + -from Mumia W.. + - added t/closure.t + -initial code from Mumia W.. + + - added support for closures for key extraction + -idea and initial code and pod from Mumia W.. + + +0.05 Thu Apr 6 03:15:09 EDT 2006 + - fixed parsing array ref key description + - added 2 tests to t/error.t + -from John Kent + - reordered Changes to be newer to older + +0.04 Wed Apr 27 01:37:00 EDT 2005 + - fixed many doc typos and mistakes + -from Brad Baxter + - fixed 'name' option bug, added test and docs + -from Damian Conway + - added support for code refs as extraction code + cleaned up the extraction code documentation + -from Damian Conway + +0.03 Tue Feb 8 03:45:32 EST 2005 + - fixed bug when using GRT and varying strings and ref_in. + new test - ref_in_varying.t + -from Brad Baxter + - fixed bug when using GRT and descending string sort with + numeric data. + new test - descending_grt_string.t + -from Brad Baxter + +0.02 Fri Aug 27 00:43:25 EDT 2004 + - fixed bug in decending integer/float GRT sorts (tests included) + -from Marc Prewitt + - cleaned up pod regarding 'fixed' and other attributes + -from Marc Prewitt + - fixed and cleaned up docs for init_code feature. now it is + documented to work only with ST and GRT + - the string_data attribute for GRT now works and there is the + string_data.t test for it. + - fixed bug with string_data mode for GRT with descending string sorts + added tests for this + -from Marc Prewitt + - made all appropriate tests support the -bench option + +0.01 Wed Mar 24 23:47:32 2004 + - original version; created by h2xs 1.21 with options + -AX -n Sort::Maker + diff --git a/MANIFEST b/MANIFEST new file mode 100644 index 0000000..e9a0eda --- /dev/null +++ b/MANIFEST @@ -0,0 +1,24 @@ +Changes +Makefile.PL +Sort/Maker.pm +MANIFEST +README +t/errors.t +t/simple.t +t/arrays.t +t/hashes.t +t/io.t +t/ref_in_varying.t +t/descending_grt_string.t +t/GRT.t +t/numbers.t +t/regex.t +t/init_code.t +t/string_data.t +t/name.t +t/code.t +t/closure.t +t/bad_code.t +t/common.pm +META.yml Module meta-data (added by MakeMaker) +slides/sort_maker.txt diff --git a/META.yml b/META.yml new file mode 100644 index 0000000..fc7bd3d --- /dev/null +++ b/META.yml @@ -0,0 +1,10 @@ +# http://module-build.sourceforge.net/META-spec.html +#XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX# +name: Sort-Maker +version: 0.06 +version_from: Sort/Maker.pm +installdirs: site +requires: + +distribution_type: module +generated_by: ExtUtils::MakeMaker version 6.17 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..a482a9d --- /dev/null +++ b/Makefile @@ -0,0 +1,728 @@ +# This Makefile is for the Sort extension to perl. +# +# It was generated automatically by MakeMaker version +# 6.17 (Revision: 1.133) from the contents of +# Makefile.PL. Don't edit this file, edit Makefile.PL instead. +# +# ANY CHANGES MADE HERE WILL BE LOST! +# +# MakeMaker ARGV: () +# +# MakeMaker Parameters: + +# ABSTRACT_FROM => q[Sort/Maker.pm] +# AUTHOR => q[Uri Guttman ] +# DISTNAME => q[Sort-Maker] +# NAME => q[Sort] +# PREREQ_PM => { } +# VERSION_FROM => q[Sort/Maker.pm] + +# --- MakeMaker post_initialize section: + + +# --- MakeMaker const_config section: + +# These definitions are from config.sh (via /usr/local/lib/perl5/5.8.6/sun4-solaris/Config.pm) + +# They may have been overridden via Makefile.PL or on the command line +AR = ar +CC = gcc +CCCDLFLAGS = -fPIC +CCDLFLAGS = +DLEXT = so +DLSRC = dl_dlopen.xs +LD = gcc +LDDLFLAGS = -G -L/usr/local/lib +LDFLAGS = -L/usr/local/lib +LIBC = /lib/libc.so +LIB_EXT = .a +OBJ_EXT = .o +OSNAME = solaris +OSVERS = 2.9 +RANLIB = : +SITELIBEXP = /usr/local/lib/perl5/site_perl/5.8.6 +SITEARCHEXP = /usr/local/lib/perl5/site_perl/5.8.6/sun4-solaris +SO = so +EXE_EXT = +FULL_AR = /usr/ccs/bin/ar +VENDORARCHEXP = +VENDORLIBEXP = + + +# --- MakeMaker constants section: +AR_STATIC_ARGS = cr +DIRFILESEP = / +NAME = Sort +NAME_SYM = Sort +VERSION = 0.06 +VERSION_MACRO = VERSION +VERSION_SYM = 0_06 +DEFINE_VERSION = -D$(VERSION_MACRO)=\"$(VERSION)\" +XS_VERSION = 0.06 +XS_VERSION_MACRO = XS_VERSION +XS_DEFINE_VERSION = -D$(XS_VERSION_MACRO)=\"$(XS_VERSION)\" +INST_ARCHLIB = blib/arch +INST_SCRIPT = blib/script +INST_BIN = blib/bin +INST_LIB = blib/lib +INST_MAN1DIR = blib/man1 +INST_MAN3DIR = blib/man3 +MAN1EXT = 1 +MAN3EXT = 3 +INSTALLDIRS = site +DESTDIR = +PREFIX = +PERLPREFIX = /usr/local +SITEPREFIX = /usr/local +VENDORPREFIX = +INSTALLPRIVLIB = $(PERLPREFIX)/lib/perl5/5.8.6 +DESTINSTALLPRIVLIB = $(DESTDIR)$(INSTALLPRIVLIB) +INSTALLSITELIB = $(SITEPREFIX)/lib/perl5/site_perl/5.8.6 +DESTINSTALLSITELIB = $(DESTDIR)$(INSTALLSITELIB) +INSTALLVENDORLIB = +DESTINSTALLVENDORLIB = $(DESTDIR)$(INSTALLVENDORLIB) +INSTALLARCHLIB = $(PERLPREFIX)/lib/perl5/5.8.6/sun4-solaris +DESTINSTALLARCHLIB = $(DESTDIR)$(INSTALLARCHLIB) +INSTALLSITEARCH = $(SITEPREFIX)/lib/perl5/site_perl/5.8.6/sun4-solaris +DESTINSTALLSITEARCH = $(DESTDIR)$(INSTALLSITEARCH) +INSTALLVENDORARCH = +DESTINSTALLVENDORARCH = $(DESTDIR)$(INSTALLVENDORARCH) +INSTALLBIN = $(PERLPREFIX)/bin +DESTINSTALLBIN = $(DESTDIR)$(INSTALLBIN) +INSTALLSITEBIN = $(SITEPREFIX)/bin +DESTINSTALLSITEBIN = $(DESTDIR)$(INSTALLSITEBIN) +INSTALLVENDORBIN = +DESTINSTALLVENDORBIN = $(DESTDIR)$(INSTALLVENDORBIN) +INSTALLSCRIPT = $(PERLPREFIX)/bin +DESTINSTALLSCRIPT = $(DESTDIR)$(INSTALLSCRIPT) +INSTALLMAN1DIR = $(PERLPREFIX)/man/man1 +DESTINSTALLMAN1DIR = $(DESTDIR)$(INSTALLMAN1DIR) +INSTALLSITEMAN1DIR = $(SITEPREFIX)/man/man1 +DESTINSTALLSITEMAN1DIR = $(DESTDIR)$(INSTALLSITEMAN1DIR) +INSTALLVENDORMAN1DIR = +DESTINSTALLVENDORMAN1DIR = $(DESTDIR)$(INSTALLVENDORMAN1DIR) +INSTALLMAN3DIR = $(PERLPREFIX)/man/man3 +DESTINSTALLMAN3DIR = $(DESTDIR)$(INSTALLMAN3DIR) +INSTALLSITEMAN3DIR = $(SITEPREFIX)/man/man3 +DESTINSTALLSITEMAN3DIR = $(DESTDIR)$(INSTALLSITEMAN3DIR) +INSTALLVENDORMAN3DIR = +DESTINSTALLVENDORMAN3DIR = $(DESTDIR)$(INSTALLVENDORMAN3DIR) +PERL_LIB = /usr/local/lib/perl5/5.8.6 +PERL_ARCHLIB = /usr/local/lib/perl5/5.8.6/sun4-solaris +LIBPERL_A = libperl.a +FIRST_MAKEFILE = Makefile +MAKEFILE_OLD = $(FIRST_MAKEFILE).old +MAKE_APERL_FILE = $(FIRST_MAKEFILE).aperl +PERLMAINCC = $(CC) +PERL_INC = /usr/local/lib/perl5/5.8.6/sun4-solaris/CORE +PERL = /usr/local/bin/perl +FULLPERL = /usr/local/bin/perl +ABSPERL = $(PERL) +PERLRUN = $(PERL) +FULLPERLRUN = $(FULLPERL) +ABSPERLRUN = $(ABSPERL) +PERLRUNINST = $(PERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)" +FULLPERLRUNINST = $(FULLPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)" +ABSPERLRUNINST = $(ABSPERLRUN) "-I$(INST_ARCHLIB)" "-I$(INST_LIB)" +PERL_CORE = 0 +PERM_RW = 644 +PERM_RWX = 755 + +MAKEMAKER = /usr/local/lib/perl5/5.8.6/ExtUtils/MakeMaker.pm +MM_VERSION = 6.17 +MM_REVISION = 1.133 + +# FULLEXT = Pathname for extension directory (eg Foo/Bar/Oracle). +# BASEEXT = Basename part of FULLEXT. May be just equal FULLEXT. (eg Oracle) +# PARENT_NAME = NAME without BASEEXT and no trailing :: (eg Foo::Bar) +# DLBASE = Basename part of dynamic library. May be just equal BASEEXT. +FULLEXT = Sort +BASEEXT = Sort +PARENT_NAME = +DLBASE = $(BASEEXT) +VERSION_FROM = Sort/Maker.pm +OBJECT = +LDFROM = $(OBJECT) +LINKTYPE = dynamic + +# Handy lists of source code files: +XS_FILES = +C_FILES = +O_FILES = +H_FILES = +MAN1PODS = +MAN3PODS = Sort/Maker.pm + +# Where is the Config information that we are using/depend on +CONFIGDEP = $(PERL_ARCHLIB)$(DIRFILESEP)Config.pm $(PERL_INC)$(DIRFILESEP)config.h + +# Where to build things +INST_LIBDIR = $(INST_LIB) +INST_ARCHLIBDIR = $(INST_ARCHLIB) + +INST_AUTODIR = $(INST_LIB)/auto/$(FULLEXT) +INST_ARCHAUTODIR = $(INST_ARCHLIB)/auto/$(FULLEXT) + +INST_STATIC = +INST_DYNAMIC = +INST_BOOT = + +# Extra linker info +EXPORT_LIST = +PERL_ARCHIVE = +PERL_ARCHIVE_AFTER = + + +TO_INST_PM = Sort/Maker.pm + +PM_TO_BLIB = Sort/Maker.pm \ + $(INST_LIB)/Sort/Maker.pm + + +# --- MakeMaker platform_constants section: +MM_Unix_VERSION = 1.42 +PERL_MALLOC_DEF = -DPERL_EXTMALLOC_DEF -Dmalloc=Perl_malloc -Dfree=Perl_mfree -Drealloc=Perl_realloc -Dcalloc=Perl_calloc + + +# --- MakeMaker tool_autosplit section: +# Usage: $(AUTOSPLITFILE) FileToSplit AutoDirToSplitInto +AUTOSPLITFILE = $(PERLRUN) -e 'use AutoSplit; autosplit($$ARGV[0], $$ARGV[1], 0, 1, 1)' + + + +# --- MakeMaker tool_xsubpp section: + + +# --- MakeMaker tools_other section: +SHELL = /bin/sh +CHMOD = chmod +CP = cp +MV = mv +NOOP = $(SHELL) -c true +NOECHO = @ +RM_F = rm -f +RM_RF = rm -rf +TEST_F = test -f +TOUCH = touch +UMASK_NULL = umask 0 +DEV_NULL = > /dev/null 2>&1 +MKPATH = $(PERLRUN) "-MExtUtils::Command" -e mkpath +EQUALIZE_TIMESTAMP = $(PERLRUN) "-MExtUtils::Command" -e eqtime +ECHO = echo +ECHO_N = echo -n +UNINST = 0 +VERBINST = 0 +MOD_INSTALL = $(PERLRUN) -MExtUtils::Install -e 'install({@ARGV}, '\''$(VERBINST)'\'', 0, '\''$(UNINST)'\'');' +DOC_INSTALL = $(PERLRUN) "-MExtUtils::Command::MM" -e perllocal_install +UNINSTALL = $(PERLRUN) "-MExtUtils::Command::MM" -e uninstall +WARN_IF_OLD_PACKLIST = $(PERLRUN) "-MExtUtils::Command::MM" -e warn_if_old_packlist + + +# --- MakeMaker makemakerdflt section: +makemakerdflt: all + $(NOECHO) $(NOOP) + + +# --- MakeMaker dist section: +TAR = tar +TARFLAGS = cvf +ZIP = zip +ZIPFLAGS = -r +COMPRESS = gzip --best +SUFFIX = .gz +SHAR = shar +PREOP = $(NOECHO) $(NOOP) +POSTOP = $(NOECHO) $(NOOP) +TO_UNIX = $(NOECHO) $(NOOP) +CI = ci -u +RCS_LABEL = rcs -Nv$(VERSION_SYM): -q +DIST_CP = best +DIST_DEFAULT = tardist +DISTNAME = Sort-Maker +DISTVNAME = Sort-Maker-0.06 + + +# --- MakeMaker macro section: + + +# --- MakeMaker depend section: + + +# --- MakeMaker cflags section: + + +# --- MakeMaker const_loadlibs section: + + +# --- MakeMaker const_cccmd section: + + +# --- MakeMaker post_constants section: + + +# --- MakeMaker pasthru section: + +PASTHRU = LIB="$(LIB)"\ + LIBPERL_A="$(LIBPERL_A)"\ + LINKTYPE="$(LINKTYPE)"\ + PREFIX="$(PREFIX)"\ + OPTIMIZE="$(OPTIMIZE)"\ + PASTHRU_DEFINE="$(PASTHRU_DEFINE)"\ + PASTHRU_INC="$(PASTHRU_INC)" + + +# --- MakeMaker special_targets section: +.SUFFIXES: .xs .c .C .cpp .i .s .cxx .cc $(OBJ_EXT) + +.PHONY: all config static dynamic test linkext manifest + + + +# --- MakeMaker c_o section: + + +# --- MakeMaker xs_c section: + + +# --- MakeMaker xs_o section: + + +# --- MakeMaker top_targets section: +all :: pure_all manifypods + $(NOECHO) $(NOOP) + + +pure_all :: config pm_to_blib subdirs linkext + $(NOECHO) $(NOOP) + +subdirs :: $(MYEXTLIB) + $(NOECHO) $(NOOP) + +config :: $(FIRST_MAKEFILE) $(INST_LIBDIR)$(DIRFILESEP).exists + $(NOECHO) $(NOOP) + +config :: $(INST_ARCHAUTODIR)$(DIRFILESEP).exists + $(NOECHO) $(NOOP) + +config :: $(INST_AUTODIR)$(DIRFILESEP).exists + $(NOECHO) $(NOOP) + +$(INST_AUTODIR)/.exists :: /usr/local/lib/perl5/5.8.6/sun4-solaris/CORE/perl.h + $(NOECHO) $(MKPATH) $(INST_AUTODIR) + $(NOECHO) $(EQUALIZE_TIMESTAMP) /usr/local/lib/perl5/5.8.6/sun4-solaris/CORE/perl.h $(INST_AUTODIR)/.exists + + -$(NOECHO) $(CHMOD) $(PERM_RWX) $(INST_AUTODIR) + +$(INST_LIBDIR)/.exists :: /usr/local/lib/perl5/5.8.6/sun4-solaris/CORE/perl.h + $(NOECHO) $(MKPATH) $(INST_LIBDIR) + $(NOECHO) $(EQUALIZE_TIMESTAMP) /usr/local/lib/perl5/5.8.6/sun4-solaris/CORE/perl.h $(INST_LIBDIR)/.exists + + -$(NOECHO) $(CHMOD) $(PERM_RWX) $(INST_LIBDIR) + +$(INST_ARCHAUTODIR)/.exists :: /usr/local/lib/perl5/5.8.6/sun4-solaris/CORE/perl.h + $(NOECHO) $(MKPATH) $(INST_ARCHAUTODIR) + $(NOECHO) $(EQUALIZE_TIMESTAMP) /usr/local/lib/perl5/5.8.6/sun4-solaris/CORE/perl.h $(INST_ARCHAUTODIR)/.exists + + -$(NOECHO) $(CHMOD) $(PERM_RWX) $(INST_ARCHAUTODIR) + +config :: $(INST_MAN3DIR)$(DIRFILESEP).exists + $(NOECHO) $(NOOP) + + +$(INST_MAN3DIR)/.exists :: /usr/local/lib/perl5/5.8.6/sun4-solaris/CORE/perl.h + $(NOECHO) $(MKPATH) $(INST_MAN3DIR) + $(NOECHO) $(EQUALIZE_TIMESTAMP) /usr/local/lib/perl5/5.8.6/sun4-solaris/CORE/perl.h $(INST_MAN3DIR)/.exists + + -$(NOECHO) $(CHMOD) $(PERM_RWX) $(INST_MAN3DIR) + +help: + perldoc ExtUtils::MakeMaker + + +# --- MakeMaker linkext section: + +linkext :: $(LINKTYPE) + $(NOECHO) $(NOOP) + + +# --- MakeMaker dlsyms section: + + +# --- MakeMaker dynamic section: + +dynamic :: $(FIRST_MAKEFILE) $(INST_DYNAMIC) $(INST_BOOT) + $(NOECHO) $(NOOP) + + +# --- MakeMaker dynamic_bs section: + +BOOTSTRAP = + + +# --- MakeMaker dynamic_lib section: + + +# --- MakeMaker static section: + +## $(INST_PM) has been moved to the all: target. +## It remains here for awhile to allow for old usage: "make static" +static :: $(FIRST_MAKEFILE) $(INST_STATIC) + $(NOECHO) $(NOOP) + + +# --- MakeMaker static_lib section: + + +# --- MakeMaker manifypods section: + +POD2MAN_EXE = $(PERLRUN) "-MExtUtils::Command::MM" -e pod2man "--" +POD2MAN = $(POD2MAN_EXE) + + +manifypods : pure_all \ + Sort/Maker.pm \ + Sort/Maker.pm + $(NOECHO) $(POD2MAN) --section=3 --perm_rw=$(PERM_RW)\ + Sort/Maker.pm $(INST_MAN3DIR)/Sort::Maker.$(MAN3EXT) + + + + +# --- MakeMaker processPL section: + + +# --- MakeMaker installbin section: + + +# --- MakeMaker subdirs section: + +# none + +# --- MakeMaker clean_subdirs section: +clean_subdirs : + $(NOECHO) $(NOOP) + + +# --- MakeMaker clean section: + +# Delete temporary files but do not touch installed files. We don't delete +# the Makefile here so a later make realclean still has a makefile to use. + +clean :: clean_subdirs + -$(RM_RF) ./blib $(MAKE_APERL_FILE) $(INST_ARCHAUTODIR)/extralibs.all $(INST_ARCHAUTODIR)/extralibs.ld perlmain.c tmon.out mon.out so_locations pm_to_blib *$(OBJ_EXT) *$(LIB_EXT) perl.exe perl perl$(EXE_EXT) $(BOOTSTRAP) $(BASEEXT).bso $(BASEEXT).def lib$(BASEEXT).def $(BASEEXT).exp $(BASEEXT).x core core.*perl.*.? *perl.core core.[0-9] core.[0-9][0-9] core.[0-9][0-9][0-9] core.[0-9][0-9][0-9][0-9] core.[0-9][0-9][0-9][0-9][0-9] + -$(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) $(DEV_NULL) + + +# --- MakeMaker realclean_subdirs section: +realclean_subdirs : + $(NOECHO) $(NOOP) + + +# --- MakeMaker realclean section: + +# Delete temporary files (via clean) and also delete installed files +realclean purge :: clean realclean_subdirs + $(RM_RF) $(INST_AUTODIR) $(INST_ARCHAUTODIR) + $(RM_RF) $(DISTVNAME) + $(RM_F) $(INST_LIB)/Sort/Maker.pm $(MAKEFILE_OLD) $(FIRST_MAKEFILE) + + +# --- MakeMaker metafile section: +metafile : + $(NOECHO) $(ECHO) '# http://module-build.sourceforge.net/META-spec.html' > META.yml + $(NOECHO) $(ECHO) '#XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX#' >> META.yml + $(NOECHO) $(ECHO) 'name: Sort-Maker' >> META.yml + $(NOECHO) $(ECHO) 'version: 0.06' >> META.yml + $(NOECHO) $(ECHO) 'version_from: Sort/Maker.pm' >> META.yml + $(NOECHO) $(ECHO) 'installdirs: site' >> META.yml + $(NOECHO) $(ECHO) 'requires:' >> META.yml + $(NOECHO) $(ECHO) '' >> META.yml + $(NOECHO) $(ECHO) 'distribution_type: module' >> META.yml + $(NOECHO) $(ECHO) 'generated_by: ExtUtils::MakeMaker version 6.17' >> META.yml + + +# --- MakeMaker metafile_addtomanifest section: +metafile_addtomanifest: + $(NOECHO) $(PERLRUN) -MExtUtils::Manifest=maniadd -e 'eval { maniadd({q{META.yml} => q{Module meta-data (added by MakeMaker)}}) } ' \ + -e ' or print "Could not add META.yml to MANIFEST: $${'\''@'\''}\n"' + + +# --- MakeMaker dist_basics section: +distclean :: realclean distcheck + $(NOECHO) $(NOOP) + +distcheck : + $(PERLRUN) "-MExtUtils::Manifest=fullcheck" -e fullcheck + +skipcheck : + $(PERLRUN) "-MExtUtils::Manifest=skipcheck" -e skipcheck + +manifest : + $(PERLRUN) "-MExtUtils::Manifest=mkmanifest" -e mkmanifest + +veryclean : realclean + $(RM_F) *~ *.orig */*~ */*.orig + + + +# --- MakeMaker dist_core section: + +dist : $(DIST_DEFAULT) $(FIRST_MAKEFILE) + $(NOECHO) $(PERLRUN) -l -e 'print '\''Warning: Makefile possibly out of date with $(VERSION_FROM)'\''' \ + -e ' if -e '\''$(VERSION_FROM)'\'' and -M '\''$(VERSION_FROM)'\'' < -M '\''$(FIRST_MAKEFILE)'\'';' + +tardist : $(DISTVNAME).tar$(SUFFIX) + $(NOECHO) $(NOOP) + +uutardist : $(DISTVNAME).tar$(SUFFIX) + uuencode $(DISTVNAME).tar$(SUFFIX) $(DISTVNAME).tar$(SUFFIX) > $(DISTVNAME).tar$(SUFFIX)_uu + +$(DISTVNAME).tar$(SUFFIX) : distdir + $(PREOP) + $(TO_UNIX) + $(TAR) $(TARFLAGS) $(DISTVNAME).tar $(DISTVNAME) + $(RM_RF) $(DISTVNAME) + $(COMPRESS) $(DISTVNAME).tar + $(POSTOP) + +zipdist : $(DISTVNAME).zip + $(NOECHO) $(NOOP) + +$(DISTVNAME).zip : distdir + $(PREOP) + $(ZIP) $(ZIPFLAGS) $(DISTVNAME).zip $(DISTVNAME) + $(RM_RF) $(DISTVNAME) + $(POSTOP) + +shdist : distdir + $(PREOP) + $(SHAR) $(DISTVNAME) > $(DISTVNAME).shar + $(RM_RF) $(DISTVNAME) + $(POSTOP) + + +# --- MakeMaker distdir section: +distdir : metafile metafile_addtomanifest + $(RM_RF) $(DISTVNAME) + $(PERLRUN) "-MExtUtils::Manifest=manicopy,maniread" \ + -e "manicopy(maniread(),'$(DISTVNAME)', '$(DIST_CP)');" + + + +# --- MakeMaker dist_test section: + +disttest : distdir + cd $(DISTVNAME) && $(ABSPERLRUN) Makefile.PL + cd $(DISTVNAME) && $(MAKE) $(PASTHRU) + cd $(DISTVNAME) && $(MAKE) test $(PASTHRU) + + +# --- MakeMaker dist_ci section: + +ci : + $(PERLRUN) "-MExtUtils::Manifest=maniread" \ + -e "@all = keys %{ maniread() };" \ + -e "print(qq{Executing $(CI) @all\n}); system(qq{$(CI) @all});" \ + -e "print(qq{Executing $(RCS_LABEL) ...\n}); system(qq{$(RCS_LABEL) @all});" + + +# --- MakeMaker install section: + +install :: all pure_install doc_install + +install_perl :: all pure_perl_install doc_perl_install + +install_site :: all pure_site_install doc_site_install + +install_vendor :: all pure_vendor_install doc_vendor_install + +pure_install :: pure_$(INSTALLDIRS)_install + +doc_install :: doc_$(INSTALLDIRS)_install + +pure__install : pure_site_install + $(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site + +doc__install : doc_site_install + $(NOECHO) $(ECHO) INSTALLDIRS not defined, defaulting to INSTALLDIRS=site + +pure_perl_install :: + $(NOECHO) $(MOD_INSTALL) \ + read $(PERL_ARCHLIB)/auto/$(FULLEXT)/.packlist \ + write $(DESTINSTALLARCHLIB)/auto/$(FULLEXT)/.packlist \ + $(INST_LIB) $(DESTINSTALLPRIVLIB) \ + $(INST_ARCHLIB) $(DESTINSTALLARCHLIB) \ + $(INST_BIN) $(DESTINSTALLBIN) \ + $(INST_SCRIPT) $(DESTINSTALLSCRIPT) \ + $(INST_MAN1DIR) $(DESTINSTALLMAN1DIR) \ + $(INST_MAN3DIR) $(DESTINSTALLMAN3DIR) + $(NOECHO) $(WARN_IF_OLD_PACKLIST) \ + $(SITEARCHEXP)/auto/$(FULLEXT) + + +pure_site_install :: + $(NOECHO) $(MOD_INSTALL) \ + read $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist \ + write $(DESTINSTALLSITEARCH)/auto/$(FULLEXT)/.packlist \ + $(INST_LIB) $(DESTINSTALLSITELIB) \ + $(INST_ARCHLIB) $(DESTINSTALLSITEARCH) \ + $(INST_BIN) $(DESTINSTALLSITEBIN) \ + $(INST_SCRIPT) $(DESTINSTALLSCRIPT) \ + $(INST_MAN1DIR) $(DESTINSTALLSITEMAN1DIR) \ + $(INST_MAN3DIR) $(DESTINSTALLSITEMAN3DIR) + $(NOECHO) $(WARN_IF_OLD_PACKLIST) \ + $(PERL_ARCHLIB)/auto/$(FULLEXT) + +pure_vendor_install :: + $(NOECHO) $(MOD_INSTALL) \ + read $(VENDORARCHEXP)/auto/$(FULLEXT)/.packlist \ + write $(DESTINSTALLVENDORARCH)/auto/$(FULLEXT)/.packlist \ + $(INST_LIB) $(DESTINSTALLVENDORLIB) \ + $(INST_ARCHLIB) $(DESTINSTALLVENDORARCH) \ + $(INST_BIN) $(DESTINSTALLVENDORBIN) \ + $(INST_SCRIPT) $(DESTINSTALLSCRIPT) \ + $(INST_MAN1DIR) $(DESTINSTALLVENDORMAN1DIR) \ + $(INST_MAN3DIR) $(DESTINSTALLVENDORMAN3DIR) + +doc_perl_install :: + $(NOECHO) $(ECHO) Appending installation info to $(DESTINSTALLARCHLIB)/perllocal.pod + -$(NOECHO) $(MKPATH) $(DESTINSTALLARCHLIB) + -$(NOECHO) $(DOC_INSTALL) \ + "Module" "$(NAME)" \ + "installed into" "$(INSTALLPRIVLIB)" \ + LINKTYPE "$(LINKTYPE)" \ + VERSION "$(VERSION)" \ + EXE_FILES "$(EXE_FILES)" \ + >> $(DESTINSTALLARCHLIB)/perllocal.pod + +doc_site_install :: + $(NOECHO) $(ECHO) Appending installation info to $(DESTINSTALLARCHLIB)/perllocal.pod + -$(NOECHO) $(MKPATH) $(DESTINSTALLARCHLIB) + -$(NOECHO) $(DOC_INSTALL) \ + "Module" "$(NAME)" \ + "installed into" "$(INSTALLSITELIB)" \ + LINKTYPE "$(LINKTYPE)" \ + VERSION "$(VERSION)" \ + EXE_FILES "$(EXE_FILES)" \ + >> $(DESTINSTALLARCHLIB)/perllocal.pod + +doc_vendor_install :: + $(NOECHO) $(ECHO) Appending installation info to $(DESTINSTALLARCHLIB)/perllocal.pod + -$(NOECHO) $(MKPATH) $(DESTINSTALLARCHLIB) + -$(NOECHO) $(DOC_INSTALL) \ + "Module" "$(NAME)" \ + "installed into" "$(INSTALLVENDORLIB)" \ + LINKTYPE "$(LINKTYPE)" \ + VERSION "$(VERSION)" \ + EXE_FILES "$(EXE_FILES)" \ + >> $(DESTINSTALLARCHLIB)/perllocal.pod + + +uninstall :: uninstall_from_$(INSTALLDIRS)dirs + +uninstall_from_perldirs :: + $(NOECHO) $(UNINSTALL) $(PERL_ARCHLIB)/auto/$(FULLEXT)/.packlist + +uninstall_from_sitedirs :: + $(NOECHO) $(UNINSTALL) $(SITEARCHEXP)/auto/$(FULLEXT)/.packlist + +uninstall_from_vendordirs :: + $(NOECHO) $(UNINSTALL) $(VENDORARCHEXP)/auto/$(FULLEXT)/.packlist + + +# --- MakeMaker force section: +# Phony target to force checking subdirectories. +FORCE: + $(NOECHO) $(NOOP) + + +# --- MakeMaker perldepend section: + + +# --- MakeMaker makefile section: + +# We take a very conservative approach here, but it's worth it. +# We move Makefile to Makefile.old here to avoid gnu make looping. +$(FIRST_MAKEFILE) : Makefile.PL $(CONFIGDEP) + $(NOECHO) $(ECHO) "Makefile out-of-date with respect to $?" + $(NOECHO) $(ECHO) "Cleaning current config before rebuilding Makefile..." + $(NOECHO) $(RM_F) $(MAKEFILE_OLD) + $(NOECHO) $(MV) $(FIRST_MAKEFILE) $(MAKEFILE_OLD) + -$(MAKE) -f $(MAKEFILE_OLD) clean $(DEV_NULL) || $(NOOP) + $(PERLRUN) Makefile.PL + $(NOECHO) $(ECHO) "==> Your Makefile has been rebuilt. <==" + $(NOECHO) $(ECHO) "==> Please rerun the make command. <==" + false + + + +# --- MakeMaker staticmake section: + +# --- MakeMaker makeaperl section --- +MAP_TARGET = perl +FULLPERL = /usr/local/bin/perl + +$(MAP_TARGET) :: static $(MAKE_APERL_FILE) + $(MAKE) -f $(MAKE_APERL_FILE) $@ + +$(MAKE_APERL_FILE) : $(FIRST_MAKEFILE) + $(NOECHO) $(ECHO) Writing \"$(MAKE_APERL_FILE)\" for this $(MAP_TARGET) + $(NOECHO) $(PERLRUNINST) \ + Makefile.PL DIR= \ + MAKEFILE=$(MAKE_APERL_FILE) LINKTYPE=static \ + MAKEAPERL=1 NORECURS=1 CCCDLFLAGS= + + +# --- MakeMaker test section: + +TEST_VERBOSE=0 +TEST_TYPE=test_$(LINKTYPE) +TEST_FILE = test.pl +TEST_FILES = t/*.t +TESTDB_SW = -d + +testdb :: testdb_$(LINKTYPE) + +test :: $(TEST_TYPE) + +test_dynamic :: pure_all + PERL_DL_NONLAZY=1 $(FULLPERLRUN) "-MExtUtils::Command::MM" "-e" "test_harness($(TEST_VERBOSE), '$(INST_LIB)', '$(INST_ARCHLIB)')" $(TEST_FILES) + +testdb_dynamic :: pure_all + PERL_DL_NONLAZY=1 $(FULLPERLRUN) $(TESTDB_SW) "-I$(INST_LIB)" "-I$(INST_ARCHLIB)" $(TEST_FILE) + +test_ : test_dynamic + +test_static :: test_dynamic +testdb_static :: testdb_dynamic + + +# --- MakeMaker ppd section: +# Creates a PPD (Perl Package Description) for a binary distribution. +ppd: + $(NOECHO) $(ECHO) '' > $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' $(DISTNAME)' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' A simple way to make efficient sort subs' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' Uri Guttman <uri@cpan.org>' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) ' ' >> $(DISTNAME).ppd + $(NOECHO) $(ECHO) '' >> $(DISTNAME).ppd + + +# --- MakeMaker pm_to_blib section: + +pm_to_blib: $(TO_INST_PM) + $(NOECHO) $(PERLRUN) -MExtUtils::Install -e 'pm_to_blib({@ARGV}, '\''$(INST_LIB)/auto'\'', '\''$(PM_FILTER)'\'')'\ + Sort/Maker.pm $(INST_LIB)/Sort/Maker.pm + $(NOECHO) $(TOUCH) $@ + +# --- MakeMaker selfdocument section: + + +# --- MakeMaker postamble section: + + +# End. diff --git a/Makefile.PL b/Makefile.PL new file mode 100644 index 0000000..d8d6b30 --- /dev/null +++ b/Makefile.PL @@ -0,0 +1,12 @@ +use ExtUtils::MakeMaker; +# See lib/ExtUtils/MakeMaker.pm for details of how to influence +# the contents of the Makefile that is written. +WriteMakefile( + 'NAME' => 'Sort', + 'DISTNAME' => 'Sort-Maker', + 'VERSION_FROM' => 'Sort/Maker.pm', # finds $VERSION + 'PREREQ_PM' => {}, # e.g., Module::Name => 1.1 + ($] >= 5.005 ? ## Add these new keywords supported since 5.005 + (ABSTRACT_FROM => 'Sort/Maker.pm', # retrieve abstract from module + AUTHOR => 'Uri Guttman ') : ()), +); diff --git a/README b/README new file mode 100644 index 0000000..eca556c --- /dev/null +++ b/README @@ -0,0 +1,28 @@ +Sort/Maker version 0.06 +======================= + +This module has two main goals: to make it easy to create correct sort +functions, and to make it simple to select the optimum sorting algorithm +for the number of items to be sorted. Sort::Maker generates complete +sort subroutines in one of four styles, plain, orcish manouver, +Schwartzian Transform and the Guttman-Rosler Transform. You can also get +the source for a sort sub you create via the sorter_source call. + +INSTALLATION + +To install this module type the following: + + perl Makefile.PL + make + make test + make install + +DEPENDENCIES + +None. + +COPYRIGHT AND LICENCE + +Licensed the same as Perl. + +Copyright (C) 2004-2006 Uri Guttman uri@sysarch.com diff --git a/Sort/Maker.pm b/Sort/Maker.pm new file mode 100644 index 0000000..a6084a4 --- /dev/null +++ b/Sort/Maker.pm @@ -0,0 +1,1789 @@ +package Sort::Maker; + +use strict; +use base qw(Exporter); + +use Data::Dumper ; + +our @EXPORT = qw( make_sorter ); +our %EXPORT_TAGS = ( 'all' => [ qw( sorter_source ), @EXPORT ] ); +our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); + +our $VERSION = '0.06'; + + +# get integer and float sizes endian order + +my $FLOAT_LEN = length pack "d", 1 ; +my $INT_LEN = length pack "N", 1 ; +my $INT_BIT_LEN = $INT_LEN * 8 ; +my $IS_BIG_ENDIAN = pack('N', 1) eq pack('L', 1) ; + +my @boolean_attrs = qw( + ascending + descending + case + no_case + signed + unsigned + signed_float + unsigned_float + varying + closure +) ; + +my @value_attrs = qw( + fixed +) ; + +my @grt_num_attrs = qw( + signed + unsigned + signed_float + unsigned_float +) ; + +my @grt_string_attrs = qw( + varying + fixed +) ; + +# these key attributes set are mutually exclusive +# only one can be set in the defaults or in any given key + +my @mutex_attrs = ( + [qw(case no_case)], + [qw(ascending descending)], + \@grt_num_attrs, + \@grt_string_attrs, +) ; + + +# code can only be an attribute and not a default attribute argument + +my %is_boolean_attr = map { $_ => 1 } @boolean_attrs ; +my %is_value_attr = map { $_ => 1 } @value_attrs, 'code' ; + +my @boolean_args = qw( + ref_in + ref_out + string_data +) ; + +my @value_args = qw( + name + init_code +) ; + +# all the attributes can be set with defaults + +my %is_boolean_arg = map { $_ => 1 } @boolean_args, @boolean_attrs ; +my %is_value_arg = map { $_ => 1 } @value_args, @value_attrs ; + +my @key_types = qw( + string + number +) ; + +my %is_key_arg = map { $_ => 1 } @key_types ; + +my %sort_makers = ( + + plain => \&_make_plain_sort, + orcish => \&_make_orcish_sort, + ST => \&_make_ST_sort, + GRT => \&_make_GRT_sort, +) ; + +my %is_arg = ( %is_key_arg, %sort_makers, %is_value_arg, %is_boolean_arg ) ; + +my %sources ; + +# this is a file lexical so the WARN handler sub can see it. + +my $eval_warnings = '' ; + +sub make_sorter { + +# clear any leftover errors + + $@ = '' ; + +# process @_ without copying it (&sub with no args) + + my( $options, $keys, $closures ) = &_process_arguments ; + return unless $keys ; + + my @closures = _get_extractor_code( $options, $keys ) ; + + return if $@ ; + +# get the sort maker for this style and build the sorter + + my $sort_maker = $sort_makers{ $options->{style} } ; + my $source = $sort_maker->( $options, $keys ) ; + return unless $source ; + +# prepend code to access any closures + + if ( @closures ) { + + my $closure_text = join '', map <{style}. +Check the key extraction code for errors. + +$source +$eval_warnings +$@ +ERR + +# install the sorter sub in the caller's package if a name was set + + if ( my $name = $options->{name} ) { + + no strict 'refs' ; + + my $package = (caller())[0] ; + + *{"${package}::$name"} = $sorter ; + } + + return $sorter ; +} + +sub _process_arguments { + + my( %options, @keys ) ; + + while( @_ ) { + + my $opt = shift ; + + if ( $sort_makers{ $opt } ) { + + $@ = + "make_sorter: Style was already set to '$options{ style }'", + return if $options{ style } ; + +# handle optional boolean => 1 + shift if @_ && $_[0] eq '1' ; + $options{ style } = $opt ; + $options{ $opt } = 1 ; + + next ; + } + + if ( $is_boolean_arg{ $opt } ) { + +# handle optional boolean => 1 + shift if @_ && $_[0] eq '1' ; + $options{ $opt } = 1 ; + next ; + } + + if ( $is_value_arg{ $opt } ) { + + $@ = "make_sorter: No value for argument '$opt'\n", + return unless @_ ; + + $options{ $opt } = shift ; + next ; + } + + if ( $is_key_arg{ $opt } ) { + + my $key_desc = $_[0] ; + +# if we have no key value or it is an option, we just have a single key. + + if ( !defined( $key_desc ) || $is_arg{ $key_desc } ) { + + push( @keys, { + type => $opt, + } + ) ; + + next ; + } + +# if we have a hash ref for the value, it is the description for this key + + if( ref $key_desc eq 'HASH' ) { + + shift @_ ; + $key_desc->{type} = $opt ; + push( @keys, $key_desc ) ; + next ; + } + +# if we have an array ref for the value, it is the description for this key + + if( ref $key_desc eq 'ARRAY' ) { + + $key_desc = _process_array_attrs(@{$key_desc}) ; + return unless $key_desc ; + + shift @_ ; + $key_desc->{type} = $opt ; + push( @keys, $key_desc ) ; + next ; + } + +# not a hash ref or an option/key so it must be code for the key + + shift ; + push( @keys, { + type => $opt, + code => $key_desc, + } + ) ; + next ; + } + + $@ = "make_sorter: Unknown option or key '$opt'\n" ; + return ; + } + + unless( @keys ) { + $@ = 'make_sorter: No keys specified' ; + return ; + } + + unless( $options{style} ) { + $@ = 'make_sorter: No sort style selected' ; + return ; + } + + return unless _process_defaults( \%options, \@keys ) ; + + return( \%options, \@keys ) ; +} + +sub _process_defaults { + + my( $opts, $keys ) = @_ ; + + return if _has_mutex_attrs( $opts, 'defaults have' ) ; + + $opts->{init_code} ||= '' ; + + foreach my $key ( @{$keys} ) { + + return if _has_mutex_attrs( $key, 'key has' ) ; + +# set descending if it is not ascending and the default is descending. + + $key->{'descending'} ||= + !$key->{'ascending'} && $opts->{'descending'} ; + +# set no_case if it is not case and the default is no_case. + + $key->{'no_case'} ||= + !$key->{'case'} && $opts->{'no_case'} ; + +# handle GRT default attrs, both number and string +# don't use the default if an attribute is set in the key + + unless( grep( $key->{$_}, @grt_num_attrs ) ) { + + @{$key}{@grt_num_attrs} = @{$opts}{@grt_num_attrs} ; + } + + unless( grep( $key->{$_}, @grt_string_attrs ) ) { + + @{$key}{@grt_string_attrs} = + @{$opts}{@grt_string_attrs} ; + } + } + + return 1 ; +} + + +sub _get_extractor_code { + + my( $opts, $keys ) = @_ ; + + my( @closures, $deparser ) ; + + foreach my $key ( @{$keys} ) { + + my $extract_code = $key->{code} ; + +# default extract code is $_ + + unless( $extract_code ) { + + $key->{code} = '$_' ; + next ; + } + + my $extractor_type = ref $extract_code ; + +# leave the extractor code alone if it is a string + + next unless $extractor_type ; + +# wrap regexes in m() + + if( $extractor_type eq 'Regexp' ) { + + $key->{code} = "m($extract_code)" ; + next ; + } + +# return an error if it is not a CODE ref + + unless( $extractor_type eq 'CODE' ) { + + $@ = "$extract_code is not a CODE or Regexp reference" ; + return ; + } + +# must be a code reference +# see if it is a closure + + if ( $opts->{closure} || $key->{closure} ) { + +# generate the code that will call this closure + + my $n = @closures ; + $key->{code} = "\$closure$n->()" ; + +#print "CODE $key->{code}\n" ; + +# copy the closure so we can process them later + + push @closures, $extract_code ; + next ; + } + +# Otherwise, try to decompile the code ref with B::Deparse... + + unless( require B::Deparse ) { + + $@ = <new("-p", "-sC"); + + my $source = eval { $deparser->coderef2text( $extract_code ) } ; + + unless( $source ) { + + $@ = "Can't use [$extract_code] as key extractor"; + return ; + } + + #print "S [$source]\n" ; + +# use just the juicy pulp inside the braces... + + $key->{code} = "do $source" ; + } + + return @closures ; +} + + +# this is used to check for any mutually exclusive attribute in +# defaults or keys + +sub _has_mutex_attrs { + + my( $href, $name ) = @_ ; + + foreach my $mutex ( @mutex_attrs ) { + + my @bad_attrs = grep $href->{$_}, @{$mutex} ; + + next if @bad_attrs <= 1 ; + + $@ = "make_sorter: Key attribute conflict: '$name @bad_attrs'"; + return 1 ; + } + + return ; +} + +sub _process_array_attrs { + + my( @attrs ) = @_ ; + + my $desc ; + + while( @attrs ) { + + my $attr = shift @attrs ; + +#print "ATTR $attr\n" ; + + if ( $is_boolean_attr{ $attr } ) { + + shift @attrs if $attrs[0] eq '1' ; + $desc->{ $attr } = 1 ; + next ; + } + + if ( $is_value_attr{ $attr } ) { + + $@ = "make_sorter: No value for attribute '$attr'", + return unless @attrs ; + + $desc->{ $attr } = shift @attrs ; + next ; + } + + $@ = "make_sorter: Unknown attribute '$attr'" ; + return ; + } + + return( $desc ) ; +} + +sub _make_plain_sort { + + my( $options, $keys ) = @_ ; + + my( @plain_compares ) ; + + foreach my $key ( @{$keys} ) { + + my $plain_compare = <{descending} ; + $plain_compare =~ s/cmp/<=>/ if $key->{type} eq 'number' ; + $plain_compare =~ s/uc //g + unless $key->{type} eq 'string' && $key->{no_case} ; + $plain_compare =~ s/EXTRACT/$key->{code}/ ; + + push( @plain_compares, $plain_compare ) ; + } + +# build the full compare block + + my $compare_source = join "\t\t||\n", @plain_compares ; + +# handle the in/out as ref options + + my $input = $options->{ref_in} ? '@{$_[0]}' : '@_' ; + my( $open_bracket, $close_bracket ) = $options->{ref_out} ? + qw( [ ] ) : ( '', '' ) ; + + my $source = <{init_code} + $open_bracket + sort { +$compare_source + } $input $close_bracket ; +} +SUB + + return $source ; +} + +sub _make_orcish_sort { + + my( $options, $keys ) = @_ ; + + my( @orcish_compares ) ; + + my $orc_ind = '1' ; + + foreach my $key ( @{$keys} ) { + + my( $l, $r ) = $key->{descending} ? qw( $b $a ) : qw( $a $b ) ; + + my $orcish_compare = <{descending} ; + $orcish_compare =~ s/cmp/<=>/ if $key->{type} eq 'number' ; + $orcish_compare =~ s/uc //g + unless $key->{type} eq 'string' && $key->{no_case} ; + + $orcish_compare =~ s/EXTRACT/$key->{code}/g ; + + push( @orcish_compares, $orcish_compare ) ; + } + +# build the full compare block + + my $compare_source = join "\t\t||\n", @orcish_compares ; + +# handle the in/out as ref options + + my $input = $options->{ref_in} ? '@{$_[0]}' : '@_' ; + my( $open_bracket, $close_bracket ) = $options->{ref_out} ? + qw( [ ] ) : ( '', '' ) ; + + my $cache_dcl = join( ',', map "%or_cache$_", 1 .. @{$keys} ) ; + + my $source = <{init_code} + my ( $cache_dcl ) ; + + $open_bracket + sort { +$compare_source + } $input $close_bracket ; +} +SUB + + return $source ; +} + +sub _make_ST_sort { + + my( $options, $keys ) = @_ ; + + my( @st_compares, @st_extracts ) ; + my $st_ind = '1' ; + + foreach my $key ( @{$keys} ) { + +#print Dumper $key ; + + my $st_compare = <[$st_ind] cmp \$b->[$st_ind] +CMP + + $st_compare =~ tr/ab/ba/ if $key->{descending} ; + $st_compare =~ s/cmp/<=>/ if $key->{type} eq 'number' ; + + $st_ind++ ; + + push( @st_compares, $st_compare ) ; + + my $st_extract = <{type} eq 'string' && $key->{no_case} ; + $st_extract =~ s/EXTRACT/$key->{code}/ ; + + chomp( $st_extract ) ; + push( @st_extracts, $st_extract ) ; + } + +# build the full compare block + + my $compare_source = join "\t\t||\n", @st_compares ; + +# build the full code for the key extracts + + my $extract_source = join ",\n", @st_extracts ; + +# handle the in/out as ref options + + my $input = $options->{ref_in} ? '@{$_[0]}' : '@_' ; + my( $open_bracket, $close_bracket ) = $options->{ref_out} ? + qw( [ ] ) : ( '', '' ) ; + + my $source = <{init_code} + return $open_bracket + map \$_->[0], + sort { +$compare_source + } + map [ \$_, +$extract_source + ], $input $close_bracket ; +} +SUB + +} + +sub _make_GRT_sort { + + my( $options, $keys ) = @_ ; + + my( $pack_format, @grt_extracts ) ; + + my $init_code = $options->{init_code} ; + +# select the input as a list - either an array ref or plain @_ + + my $input = $options->{ref_in} ? '@{$_[0]}' : '@_' ; + +# use this to count keys so we can generate init_code for each key + + my $key_ind = '0' ; + + foreach my $key ( @{$keys} ) { + +#print Dumper $key ; + + my( $key_pack_format, $grt_extract, $key_init_code ) = + $key->{type} eq 'number' ? + _make_GRT_number_key( $key ) : + _make_GRT_string_key( $key, $key_ind++ ) ; + +#print "[$key_pack_format] [$grt_extract] [$key_init_code]\n" ; + + return unless $key_pack_format ; + + $pack_format .= $key_pack_format ; + + if ( $key_init_code ) { + +# fix generated init_code that scans input to use the proper input + + $key_init_code =~ s/INPUT$/$input/m ; + $init_code .= $key_init_code ; + } + + chomp( $grt_extract ) ; + push( @grt_extracts, $grt_extract ) ; + } + +############ +# pack the record index. +# SKIP for 'string_data' attribute +########## + + $pack_format .= 'N' unless $options->{string_data} ; + + my $extract_source = join ",\n", @grt_extracts ; + chomp( $extract_source ) ; + +# handle the in/out as ref options + + my( $open_bracket, $close_bracket ) = $options->{ref_out} ? + qw( [ ] ) : ( '', '' ) ; + + + my $get_index_code = <{string_data} ? <{descending} ) { + +# negate the key values so they sort in descending order + + $negate_code = '$val = -$val; ' ; + +# descending GRT number sorts must be signed to handle the negated values + + $key->{signed} = 1 if delete $key->{unsigned} ; + $key->{signed_float} = 1 if delete $key->{unsigned_float} ; + } + else { + + $negate_code = '' ; + } + + if ( $key->{unsigned} ) { + + $pack_format = 'N' ; + $val_code = '$val' ; + } + elsif ( $key->{signed} ) { + +# convert the signed integer to unsigned by flipping the sign bit + + $pack_format = 'N' ; + $val_code = "\$val ^ (1 << ($INT_BIT_LEN - 1))" + } + elsif ( $key->{unsigned_float} ) { + +# pack into A format with a length of a float + + $pack_format = "A$FLOAT_LEN" ; + $val_code = qq{ $FLOAT_PACK ^ "\\x80" } ; + } + else { + +# must be a signed float + + $pack_format = "A$FLOAT_LEN" ; + +# debug code that can be put in to dump what is being packed. +# print "V [\$val]\\n" ; +# print unpack( 'H*', pack 'd', \$val ), "\\n" ; + + +# only negate float numbers other than 0. in some odd cases a float 0 +# gets converted to a -0 (which is a legal ieee float) and the GRT +# packs it as 0x80000.. instead of 0x00000....) + +# it happens on sparc and perl 5.6.1. it needs a math op (the tests +# runs the gold sort which does <=> on it) and then negation for -0 to +# show up. 5.8 on sparc is fine and all perl versions on intel are +# fine + +# the 'signed float edge case descending' test in t/numbers.t +# looks for this. + + $negate_code =~ s/;/ if \$val;/ ; + + $val_code = qq{ $FLOAT_PACK ^ + ( \$val < 0 ? "$XOR_NEG" : "\\x80" ) + } ; + } + + my $grt_extract = <{code} ; $negate_code$val_code } +CODE + + return( $pack_format, $grt_extract, '' ) ; +} + +sub _make_GRT_string_key { + + my( $key, $key_ind ) = @_ ; + + my( $init_code, $pack_format ) ; + + if ( my $fix_len = $key->{fixed} ) { + +# create the xor string to invert the key for a descending sort. + $init_code = <{descending} ; + my \$_xor$key_ind = "\\xFF" x $fix_len ; +CODE + $pack_format = "a$fix_len" ; + + } + elsif ( $key->{varying} ) { + +# create the code to scan for the maximum length of the values for this key +# the INPUT will be changed later to handle a list or a ref as input + + $init_code = <{code} ; length \$val } INPUT + ) ; +CODE + +# create the xor string to invert the key for a descending sort. + + $init_code .= <{descending} ; + my \$_xor$key_ind = "\\xFF" x \$len$key_ind ; +CODE + +# we pack as a null padded string. its length is in the + + $pack_format = "a\${len$key_ind}" ; + } + else { + +# we can't sort plain (null terminated) strings in descending order + + $@ = <{descending} ; +make_sorter: A GRT descending string needs to select either the +'fixed' or 'varying' attributes +ERR + + $pack_format = 'Z*' ; + } + + my $descend_code = $key->{descending} ? " . '' ^ \$_xor$key_ind" : '' ; + + my $grt_extract = <{no_case} ; + $grt_extract =~ s/EXTRACT/$key->{code}/ ; + + return( $pack_format, $grt_extract, $init_code ) ; +} + +sub sorter_source { + + $sources{ +shift || '' } ; +} + +1 ; + +__END__ + +=head1 NAME + +Sort::Maker - A simple way to make efficient sort subs + +=head1 SYNOPSIS + + use Sort::Maker ; + + my $sorter = make_sorter( ... ) ; + + +=head1 DESCRIPTION + +This module has two main goals: to make it easy to create correct sort +functions, and to make it simple to select the optimum sorting +algorithm for the number of items to be sorted. Sort::Maker generates +complete sort subroutines in one of four styles, plain, orcish +manouver, Schwartzian Transform and the Guttman-Rosler Transform. You +can also get the source for a sort sub you create via the +sorter_source call. + +=head1 C + +The sub C is exported by Sort::Maker. It makes a sort sub +and returns a reference to it. You describe how you want it to sort +its input by passing general options and key descriptions to +C. + +=head2 Arguments to C + +There are two types of arguments, boolean and value. Boolean arguments +can be set just with the option name and can optionally be followed by +'1'. You can easily set multiple boolean general arguments with +qw(). Value arguments must have a following value. Arguments can +appear in any order but the key descriptions (see below) must appear +in their sort order. The code examples below show various ways to set +the various arguments. + +Arguments fall into four categories: selecting the style of the sort, +key descriptions, setting defaults for key description attributes, and +setting general flags and values. The following sections will describe +the categories and their associated arguments. + +=head2 Sort Style + +The style of the sort to be made is selected by setting one of the +following Boolean arguments. Only one may be set otherwise an error +is reported (see below for error handling). Also see below for +detailed descriptions of the supported sort styles. + + plain + orcish + ST + GRT + + # Make a plain sorter + my $plain_sorter = make_sorter( qw( plain ) ... ) ; + + # Make an orcish manouevre sorter + my $orcish_sorter = make_sorter( orcish => 1 ... ) ; + + # Make a Schwartzian Transform sorter + my $st_sorter = make_sorter( 'ST', 1, ... ) ; + + # Make a GRT sort + my $GRT = make_sorter( 'GRT', ... ) ; + +=head2 Key Attribute Defaults + +The following arguments set defaults for the all of the keys' +attributes. These default values can be overridden in any individual +key. Only one of the attributes in each of the groups below can be +set as defaults or for any given key. If more than one attribute in +each group is set, then C will return an error. The +attribute that is the default for each group is marked. See below for +details on key attributes. + + ascending (default) + descending + + case (default) + no_case + + signed + unsigned + signed_float (default) + unsigned_float + + fixed + varying + +=head2 General Options + +These arguments set general options that apply to how the generated +sorter interacts with the outside world. + +=head3 C + +This is a value option which exports the generated sort sub to that +name. The call to C must be run to install the named +named function before it is called. You should still check the result +of C to see if an error occurred (it returns undef). + + my $sorter = make_sorter( name => 'sort_func', ... ) ; + die "make_sorter: $@" unless $sorter ; + + ... + + @sorted = sort_func @unsorted ; + +=head3 C + +This boolean arguments specifies that the input to and output from the +sort sub will be array references. C makes the sorter only +take as input a single array reference (which contains the unsorted +records). C makes the sorter only return a single array +reference (which contains the sorted records). You can set both of +these options in a sorter. + +Note: This does not affect key extraction code which still gets each +record in C<$_>. It only modifies the I/O of the generated sorter. + + # input records are in an array reference + my $sorter = make_sorter( qw( ref_in ), ... ) ; + @sorted_array = $sorter->( \@unsorted_input ) ; + + # sorted output records are in an array reference + my $sorter = make_sorter( ref_out => 1, ... ) ; + $sorted_array_ref = $sorter->( @unsorted_input ) ; + + # input and output records are in array references + my $sorter = make_sorter( qw( ref_in ref_out ), ... ) ; + $sorted_array_ref = $sorter->( \@unsorted_input ) ; + +=head3 C + +This boolean argument specifies that the input records will be plain +text strings with no null (0x0) bytes in them. It is only valid for +use with the GRT and it is ignored for the other sort styles. It tells +the GRT that it can put the record directly into the string cache and +it will be separated from the packed keys with a null byte (hence that +restriction). This is an optimization that can run slightly faster +than the normal index sorting done with the GRT. Run this to see the +benchmark results. + + perl t/string_data.t -bench + +=head3 C + +This value argument is code that will be put into the beginning of the +generated sorter subroutine. It is meant to be used to declare lexical +variables that the extraction code can use. Normally different +extraction code have no way to share common code. By declaring +lexicals with the C option, some key extraction code +can save data there for use by another key. This is useful if you have +two (or more) keys that share a complex piece of code such as +accessing a deep value in a record tree. + +For example, suppose the input record is an array of arrays of hashes +of strings and the string has 2 keys that need to be grabbed by a +regex. The string is a string key, a ':' and a number key. So the +common part of the key extraction is: + + $_->[0][0]{a} + +And the make_sorter call is: + + my $sorter = make_sorter( + 'ST', + init_code => 'my( $str, $num ) ;', + string => 'do{( $str, $num ) = + $_->[0][0]{a} =~ /^(\w+):(\d+)$/; $str}', + number => '$num' + ) ; + +In the above code both keys are extracted in the first key extraction +code and the number key is saved in C<$num>. The second key extraction +code just uses that saved value. + +Note that C is only useful in the ST and GRT sort styles as +they process all the keys of a record at one time and can use +variables declared in C to transfer data to later keys. The +plain and orcish sorts may not process a later key at the same time as +an earlier key (that only happens when the earlier key is compared to +an equal key). Also for C to be a win, the data set must be +large enough and the work to extract the keys must be hard enough for +the savings to be noticed. The test init_code.t shows some examples +and you can see the speedup when you run: + + perl t/init_code.t -bench + +=head2 Key Description Arguments + +Sorting data requires that records be compared in some way so they can +be put into a proper sequence. The parts of the records that actually +get compared are called its keys. In the simplest case the entire +record is the key, as when you sort a list of numbers or file +names. But in many cases the keys are embedded in the full record and +they need to be extracted before they can be used in comparisons. +Sort::Maker uses key descriptions that extract the key from the +record, and optional other attributes that will help optimize the +sorting operation. This section will explain how to pass key +description arguments to the make_sorter subroutine and what the +various attributes mean and how to best use them. + +The generated sorter will sort the records according to the order of +the key arguments. The first key is used to compare a pair of records +and if they are deemed equal, then the next key is examined. This happens +until the records are given an ordering or you run out of keys and the +records are deemed equal in sort order. Key descriptions can be mixed +with the other arguments which can appear in any order and anywhere in +the argument list, but the keys themselves must be in the desired +order. + +A key argument is either 'string' or 'number' followed by optional +attributes. The key type sets the way that the key is compared +(e.g. using 'cmp' or '<=>'). All key attributes can be set from the +default values in the global arguments or set in each individual key +description. + +There are 4 ways to provide attributes to a key: + +=head3 No attributes + +A key argument which is either at the end of the argument list or is +followed by a valid keyword token has no explict attributes. This key +will use the default attributes. In both of these examples, a default +attribute was set and used by the key description which is just a +single key argument. + + # sort the record as a single number in descending order + my $sorter = make_sorter( qw( plain number descending ) ) ; + + # sort the record as a case sensitive string + my $sorter = make_sorter( qw( plain case string ) ) ; + + # sort the record as a single number in ascending order + my $sorter = make_sorter( qw( ST number ) ) ; + +=head3 Only Code as a Value + +A key argument which is followed by a scalar value which is not a +valid keyword token, will use that scalar value as its key extraction +code. See below for more on key extraction code. + + # sort by the first (optionally signed) number matched + my $sorter = make_sorter( qw( plain number /([+-]?\d+)/ ) ) ; + + # string sort by the 3rd field in the input records (array refs) + my $sorter = make_sorter( 'ST', string => '$_->[2]' ) ; + +=head3 An Array Reference + +A key argument which is followed by an array reference will parse that +array for its description attributes. As with the general boolean +arguments, any boolean attribute can be optionally followed by a +'1'. Value attributes must be followed by their value. + + # another way to specify the same sort as above + # sort by the first (optionally signed) number matched + + my $sorter = make_sorter( + qw( plain ), + number => [ + code => '/(\d+)/', + 'descending', + ], + ) ; + + # same sort but for the GRT which uses the 'unsigned' + # attribute to optimize the sort. + + my $sorter = make_sorter( + qw( GRT ), + number => [ + qw( descending unsigned ), + code => '/(\d+)/', + ], + ) ; + +=head3 A Hash Reference + +A key argument which is followed a hash reference will use that hash +as its description attributes. Any boolean attribute in the hash must +have a value of '1'. Value attributes must be followed by their +value. + + # another way to specify the same sort as above + # sort by the first (optionally signed) number matched + + my $sorter = make_sorter( + qw( plain ), + number => { + code => '/(\d+)/', + descending => 1, + }, + ) ; + + # a multi-key sort. the first key is a descending unsigned + # integer and the second is a string padded to 10 characters + + my $sorter = make_sorter( + qw( GRT ), + number => { + code => '/(\d+)/', + descending => 1, + unsigned => 1, + }, + string => { + code => '/FOO<(\w+)>/', + fixed => 10, + }, + ) ; + +=head2 Key Description Attributes + +What follows are the attributes for key descriptions. Most use +the default values passed in the arguments to C. + +=head3 C + +This value attribute is the code that will be used to extract a key +from the input record. It can be a string of Perl code, a qr// regular +expression (Regexp reference) or an anonymous sub (CODE reference) +that operates on $_ and extracts a value. The code will be wrapped in +a do{} block and called in a list context so that regular expressions +can just use () to grab a key value. The code defaults to C<$_> which +means the entire record is used for this key. You can't set the +default for code (unlike all the other key attributes). See the +section on Extraction Code for more. + + # make an ST sort of the first number grabbed in descending order + + my $sorter = make_sorter( + qw( ST ), + number => { + code => '/(\d+)/', + descending => 1, + }, + ) ; + +=head3 C + +These two Boolean attributes control the sorting order for this +key. If a key is marked as C (which is the initial default +for all keys), then lower keys will sort before higher +keys. C sorts have the higher keys sort before the lower +keys. It is illegal to have both set in the defaults or in any key. + + # sort by descending order of the first grabbed number + # and then sort in ascending order the first grabbed + + my $sorter = make_sorter( + qw( ST descending ), + number => { + code => '/(\d+)/', + }, + string => { + code => '/<(\w+)>/', + ascending => 1, + }, + ) ; + + # this will return undef and store an error in $@. + # you can't have both 'ascending' and 'descending' as defaults + + my $sorter = make_sorter( + qw( ST ascending descending ), + number => { + code => '/(\d+)/', + descending => 1, + }, + ) ; + + # this will return undef and store an error in $@. + # you can't have both 'ascending' and 'descending' in a key + + my $sorter = make_sorter( + qw( ST ) + number => { + code => '/(\d+)/', + descending => 1, + ascending => 1, + }, + ) ; + +=head3 C + +These two Boolean attributes control how 'string' keys handle case +sensitivity. If a key is marked as C (which is the initial +default for all keys), then keys will treat upper and lower case +letters as different. If the key is marked as C then they +are treated as equal. It is illegal to have both set in the defaults +or in any key. Internally this uses the uc() function so you can use +locale settings to affect string sorts. + + # sort by the first grabbed word with no case + # and then sort the grabbed with case + + my $sorter = make_sorter( + qw( ST no_case ), + string => { + code => '/(\w+)/', + }, + string => { + code => '/<(\w+)>/', + case => 1, + }, + ) ; + + # this will return undef and store an error in $@. + # you can't have both 'case' and 'no_case' as defaults + + my $sorter = make_sorter( + qw( ST no_case case ), + string => { + code => '/(\w+)/', + }, + ) ; + + # this will return undef and store an error in $@. + # you can't have both 'case' and 'no_case' in a key + + my $sorter = make_sorter( + qw( ST ) + string => { + code => '/(\w+)/', + no_case => 1, + case => 1, + }, + ) ; + +=head3 C + +This Boolean attribute causes this key to use call its CODE reference +to extract its value. This is useful if you need to access a lexical +variable during the key extraction. A typical use would be if you have +a sorting order stored in a lexical and need to access that from the +extraction code. If you didn't set the C attribute for this +key, the generated source (see Key Extraction) would not be able to +see that lexical which will trigger a Perl compiling error in +make_sorter. + + my @months = qw( + January February March April May June + July August September October November December ) ; + my @month_jumble = qw( + February June October March January April + July November August December May September ) ; + + my %month_to_num ; + @month_to_num{ @months } = 1 .. @months ; + +# this will fail to generate a sorter if 'closure' is removed +# as %month_to_num will not be in scope to the eval inside sort_maker. + + my $sorter = make_sorter( + 'closure', + number => sub { $month_to_num{$_} }, + ) ; + + my @sorted = $sorter->( @month_jumble ) ; + + +=head3 C (GRT only) + +These Boolean attributes are only used by the GRT sort style. They are +meant to describe the type of a number key so that the GRT can best +process and cache the key's value. It is illegal to have more than one +of them set in the defaults or in any key. See the section on GRT +sorting for more. + +The C and C attributes mark this number key as an +integer. The GRT does the least amount of work processing an unsigned +integer and only slightly more work for a signed integer. It is worth +using these attributes if a sort key is restricted to integers. + +The C (which is the normal default for all keys) and +C attributes mark this number key as a float. The GRT +does the less work processing an unsigned float then a signed float. +It is worth using the C attribute if a sort key is +restricted to non-negative values. The C attribute is +supported to allow overriding defaults and to make it easier to +auto-generate sorts. + +=head3 C (GRT only) + +These attributes are only used by the GRT sort style. They are used +to describe the type of a string key so that the GRT can properly +process and cache the key's value. It is illegal to have more than one +of them set in the defaults or in any key. See the section on GRT +sorting for more. + +C is a value attribute that marks this string key as always +being this length. The extracted value will either be padded with null +(0x0) bytes or truncated to the specified length (the value of +C). The data in this key may have embedded null bytes (0x0) and +may be sorted in descending order. + +C is a Boolean attribute marks this string key as being of +varying lengths. The GRT sorter will do a scan of all of this key's +values to find the maximum string length and then it pads all the +extracted values to that length. The data in this key may have +embedded null bytes (0x0) and may be sorted in descending order. + +=head2 Key Extraction Code + +Each input record must have its sort keys extracted from the data. +This is the purpose of the 'code' attribute in key descriptions. The +code has to operate on a record which is in C<$_> and it must return +the key value. The code is executed in a list context so you can use +grabs in m// to return the key. Note that only the first grab will be +used but you shouldn't have more than one anyway. See the examples +below. + +Code can be either a string, a qr// object (Regexp reference) or an +anonymous sub (CODE reference). + +If qr// is used, the actual generated code will be m($qr) which works +because qr// will interpolate to its string representation. The +advantage of qr// over a string is that the qr// will be syntax +checked at compile time while the string only later when the generated +sorter is compiled by an eval. + +If a CODE reference is found, it is used to extract the key in the +generated sorter. As with qr//, the advantage is that the extraction +code is syntax checked at compile time and not runtime. Also the +deparsed code is wrapped in a C block so you may use complex +code to extract the key. In the default case a CODE reference will be +deparsed by the B::Deparse module into Perl source. If the key has the +C attribute set, the code will be called to extract the key. + +The following will generate sorters with exact same behavior: + + $sorter = make_sorter( 'ST', string => '/(\w+)/' ) ; + $sorter = make_sorter( 'ST', string => qr/(\w+)/ ) ; + $sorter = make_sorter( 'ST', string => sub { /(\w+)/ } ) ; + $sorter = make_sorter( 'ST', 'closure', string => sub { /(\w+)/ } ) ; + +Extraction code for a key can be set in one of three ways. + +=head3 No explicit code + +If you don't pass any extraction code to a key, it will default to C<$_> +which is the entire record. This is useful in certain cases such as in +simple sorts where you are sorting the entire record. + + # sort numerically and in reverse order + my $sorter = make_sorter( qw( plain number descending ) ; + + # sort with case folding + my $sorter = make_sorter( qw( plain no_case string ) ; + + # sort by file time stamp and then by name + my $sorter = make_sorter( 'ST', number => '-M', 'string' ) ; + +=head3 Code is the only key attribute + +In many cases you don't need to specify any specific key attributes (the +normal or globally set defaults are fine) but you need extraction +code. If the argument that follows a key type ( 'string' or 'number' ) +is not a valid keyword, it will be assumed to be the extraction code +for that key. + + # grab the first number string as the key + my $sorter = make_sorter( qw( plain number /(\d+)/ ) ) ; + + # no_case string sort on the 3rd-5th chars of the 2nd array element + my $sorter = make_sorter( + plain => 1, + no_case => 1, + string => 'substr( $_->[1], 2, 3)' + ) ; + +=head3 Key needs specific attributes + +When the key needs to have its own specific attributes other than its +code, you need to pass them in an ARRAY or HASH reference. This is +mostly needed when there are multiple keys and the defaults are not +correct for all the keys. + + # string sort by the first 3 elements of the array record with + # different case requirements + + my $sorter = make_sorter( + ST => 1, + string => { + code => '$_->[0]', + no_case => 1, + }, + string => '$_->[1]', + string => { + code => '$_->[2]', + no_case => 1, + }, + ) ; + + # GRT sort with multiple unsigned integers and padded strings + # note that some keys use a hash ref and some an array ref + # the record is marked with key\d: sections + my $sorter = make_sorter( + GRT => 1, + descending => 1, + number => { + code => 'key1:(\d+)', + unsigned => 1, + }, + number => [ + code => 'key2:([+-]?\d+)', + qw( signed ascending ), + ], + string => [ + code => 'key3:(\w{10})', + fixed => 1, + ascending => 1, + ], + # pad the extracted keys to 8 chars + string => { + code => 'key4:([A-Z]+)', + pad => 8, + }, + ) ; + +=head1 Key Caching + +A good question to ask is "What speed advantages do you get from this +module when all the sorts generated use Perl's internal sort function?" +The sort function has a O( N * log N ) growth function which means that +the amount of work done increases by that formula as N (the number of +input records) increases. In a plain sort this means the the key +extraction code is executed N * log N times when you only have N +records. That can be a very large waste of cpu time. So the other three +sort styles speed up the overall sort by only executing the extraction +code N times by caching the extracted keys. How they cache the keys is +their primary difference. To compare or study the actual code generated +for the different sort styles, you can run make_sorter and just change +the style. Then call sorter_source (not exported by default) and pass it +the sort code reference returned by make_sorter. It will return the +generated sort source. + +=head2 C + +Plain sorting doesn't do any key caching. It is fine for short input +lists (see the Benchmark section) and also as a way to see how much CPU +is saved when using one of the other styles. + +=head2 C + +The Orcish maneuvre (created by Joseph Hall) caches the extracted keys +in a hash. It does this with code like this: + + $cache{$a} ||= CODE($a) ; + +CODE is the extract code and it operates on a record in $a. If we have +never seen this record before then the cache entry will be undef and the +||= operator will assign the extracted key to that hash slot. The next +time this record is seen in a comparison, the saved extracted key will +be found in the hash and used. The name orcish comes from OR-cache. + +=head2 C + +The ST (Schwartzian Transform and popularized by Randal Schwartz) uses +an anonymous array to store the record and its extracted keys. It +first executes a map that creates an anonymous array: + + map [ $_, CODE1( $_ ), CODE2( $_ ) ], @input + +The CODE's extract the set of keys from the record but only once per +record so it is O(N). Now the sort function can just do the comparisons +and it returns a list of sorted anonymous arrays. + + sort { + $a->[1] cmp $b->[1] + || + $a->[2] cmp $b->[2] + } + +Finally, we need to get back the original records which are in the first +slot of the anonymous array: + + map $_->[0] + +This is why the ST is known as a map/sort/map technique. + +=head2 C + +The Guttman-Rosler Transform (popularized by Uri Guttman and Larry +Rosler) uses a string to cache the extracted keys as well as either +the record or its index. It is also a map/sort/map technique but +because its cache is a string, it can be sorted without any Perl level +callback (the {} block passed to sort). This is a signifigant win +since that callback is running O( N log N). But this speedup comes at +a cost of complexity. You can't just join the keys into a string and +properly sort them. Each key may need to be processed so that it will +correctly sort in order and it doesn't interfere with other keys. That +is why the GRT has several key attributes to enable it to properly and +efficiently pack the sort keys into a single string. The following +lists the GRT key attributes, when you need them and what key +processing is done for each. Note that you can always enable the GRT +specific attributes as they are just ignored by the other sort styles. + +The GRT gains its speed by using a single byte string to cache all of +the extracted keys from a given input record. Packing keys into a +string such that it will lexically sort the correct way requires some +deep mojo and data munging. But that is why this module was written - +to hide all that from the coder. Below are descriptions of how the +various key types are packed and how to best use the GRT specific key +attributes. Note: you can only use one of the GRT number or string +attributes for any key. Setting more than one in either the defaults +or in any given key is an error (a key's attribute can override a +default choice). + +=head3 C + +The 'unsigned' Boolean attribute tells the GRT that this number key is a +non-negative integer. This allows the GRT to just pack it into 4 bytes +using the N format (network order - big endian). An integer packed this +way will have its most significant bytes compared before its least +signifigant bytes. This involves the least amount of key munging and so +it is the most efficient way to sort numbers in the GRT. + +If you want this key to sort in descending order, then the key value is +negated and normalized (see the 'signed' attribute) so there is no +advantage to using 'unsigned'. + +=head3 C + +The 'signed' Boolean attribute tells the GRT that this number key is +an integer. This allows the GRT to just pack it into 4 bytes using the +N format (network order - big endian). The key value must first be +normalized which will convert it to an unsigned integer but with the +same ordering as a signed integer. This is simply done by inverting +the sign (highest order) bit of the integer. As mentioned above, when +sorting this key in descending order, the GRT just negates the key +value. + +NOTE: In the GRT the signed and unsigned integer attributes only work +on perl built with 32 bit integers. This is due to using the N format +of pack which is specified to be 32 bits. A future version may support +64 bit integers (anyone want to help?). + +=head3 C + +The 'unsigned_float' Boolean attribute tells the GRT that this number +key is a non-negative floating point number. This allows the GRT to +pack it into 8 bytes using the 'd' format. A float packed this way +will have its most significant bytes compared before its least +signifigant bytes. + +=head3 C + +The C Boolean attribute (which is the default for all +number keys when using the GRT) tells the GRT that this number key is +a floating point number. This allows the GRT to pack it into 8 bytes +using the 'd' format. A float packed this way will have its most +significant bytes compared before its least signifigant bytes. When +processed this key will be normalized to an unsigned float similar to +to the C to C conversion mentioned above. + +NOTE: The GRT only works with floats that are in the IEEE format for +doubles. This includes most modern architectures including x86, sparc, +powerpc, mips, etc. If the cpu doesn't have IEEE floats you can either +use the integer attributes or select another sort style (all the +others have no restriction on float formats). + +=head3 simple string. + +If a string key is being sorted in ascending order with the GRT and it +doesn't have one of the GRT string attributes, it will be packed +without any munging and a null (0x0) byte will be appended to it. This +byte enables a shorter string to sort before longer ones that start +with the shorter string. + +NOTE: You cannot sort strings in descending order in the GRT unless +the key has either the 'fixed' or 'varying' attributes set. Also, if a +string is being sorted in ascending order but has any null (0x0) bytes +in it, the key must have one of those attributes set. + +=head3 C + +This value attribute tells the GRT to pack this key value as a fixed +length string. The extracted value will either be padded with null +(0x0) bytes or truncated to the specified length (the value of the +C attribute). This means it can be packed into the cache string +with no padding and no trailing null byte is needed. The key can +contain any data including null (0x0) bytes. Data munging +happens only if the key's sort order is descending. Then the key value is +xor'ed with a same length string of 0xff bytes. This toggles each bit +which allows for a lexical comparison but in the reverse order. This +same bit inversion is used for descending varying strings. + +=head3 C + +This Boolean attribute tells the GRT that this key value is a varying length +string and has no predetermined padding length. A prescan is done to +determine the maximum string length for this key and that is used as the +padding length. The rest is the same as with the 'fixed' attribute. + +=head2 C + +This sub (which can be exported) returns the source of a generated +sort sub or the source of the last one that had an error. To get the +source of an existing sort sub, pass it a reference to that sub (i.e. +the reference returned from make_sorter). To get the source for a +failed call to make_sorter, don't pass in any arguments. + + my $sorter = make_sorter( ... ) ; + print sorter_source( $sorter ) ; + + make_sorter( name => 'my_sorter', ... ) ; + print sorter_source( \&my_sorter ) ; + + my $sorter = make_sorter( ... ) + or die "make_sorter error: $@\n", sorter_source(); + +If all you want is the generated source you can just do: + + print sorter_source make_sorter( ... ) ; + +=head2 Error Handling + +When C detects an error (either bad arguments or when the +generated sorter won't compile), it returns undef and set $@ to an +error message. The error message will include the generated source and +compiler and warning errors if the sorter didn't compile correctly. +The test t/errors.t covers all the possible error messages. You can +also retrieve the generated source after a compiling error by calling +C. + +=head1 TESTS + +C uses a table of test configurations that can both run +tests and benchmarks. Each test script is mostly a table that +generates multiple versions of the sorters, generate sample data and +compares the sorter results with a sort that is known to be good. If +you run the scripts directly and with a -bench argument, then they +generate the same sorter subs and benchmark them. This design ensures +that benchmarks are running on correctly generated code and it makes +it very easy to add more test and benchmark variations. The code that +does all the work is in t/common.pl. Here is a typical test table +entry: + + { + skip => 0, + source => 0, + name => 'init_code', + gen => sub { rand_choice( @string_keys ) . ':' . + rand_choice( @number_keys ) }, + gold => sub { + ($a =~ /^(\w+)/)[0] cmp ($b =~ /^(\w+)/)[0] + || + ($a =~ /(\d+$)/)[0] <=> ($b =~ /(\d+$)/)[0] + }, + args => [ + init_code => 'my( $str, $num ) ;', + string => 'do{( $str, $num ) = /^(\w+):(\d+)$/; $str}', + number => '$num', + ], + }, + +C is a boolean that causes this test/benchmark to be skipped. +Setting C causes the sorter's source to be printed out. +C is a sub that generates a single input record. There are +support subs in t/common.pl that will generate random data. Some tests +have a C field which is fixed data for a test (instead of the +generated data). The field is a comparision subroutine usable +by the sort function. It is used to sort the test data into a golden +result which is used to compare against all the generated sorters. +C is an anonymous array of arguments for a sorter or a hash ref +with multiple named/args pairs. See t/io.t for an example of that. + +=head1 BENCHMARKS + +=head1 EXPORT + +This module always exports the C sub. It can also +optionally export C. + +=head1 BUGS + +Sort::Maker GRT currently works only with 32 bit integers due to pack +N format being exactly 32 bits. If someone with a 64 bit Perl wants to +work on using the Q format or the ! suffix and dealing with endian +issues, I will be glad to help and support it. It would be best if +there was a network (big endian) pack format for quads/longlongs but +that can be done similarly to how floats are packed now. + +=head1 AUTHOR + +Uri Guttman, Euri@stemsystems.comE + +=head1 ACKNOWLEDGEMENTS + +I would like to thank the inimitable Damian Conway for his help in the +API design, the POD, and for being a good Perl friend. + +And thanks to Boston.pm for the idea of allowing qr// for key +extraction code. + +=cut diff --git a/exp/bar1 b/exp/bar1 new file mode 100644 index 0000000..688d0ab --- /dev/null +++ b/exp/bar1 @@ -0,0 +1,281 @@ +my %is_boolean_opt = map { $_ => 1 } qw( + descending + no_case + ref_in + ref_out + GRT_refs +) ; + +my %is_value_opt = map { $_ => 1 } qw( + name + +) ; + +my @key_types = qw( + string + string_case + string_no_case + number + integer +) ; + +my $key_alt = join '|', @key_types ; +my $key_re = qr/^([+-])?($key_alt)$/ ; + +my %sort_makers = ( + + plain => \&make_plain_sort, + ST => \&make_ST_sort, + GRT => \&make_GRT_sort, +) ; + +my %sources ; +my $error_source ; + + +use Data::Dumper ; + +sub sorter_source { + + $sources{ +shift } || $error_source ; +} + +sort_maker() ; + + +sub sort_maker { + + my( $options, $keys ) = process_options( @ARGV ) ; + +print Dumper $options, $keys ; + + die "no keys specified" unless @{$keys} ; + +return ; + my $sort_maker = $sort_makers{ $options->{style} } ; + + die "no sort style selected" unless $sort_maker ; + + my $source = $sort_maker->( $keys ) ; + + my $sorter = eval $source ; + + die "bad source $@" if $@ ; + + $sources{ $sorter } = $source ; + + if ( my $name = $options->{name} ) { + + no strict 'refs' ; + + my $package = (caller())[0] ; + + *${"${package}::$name"} = $sorter ; + } + + return $sorter ; +} + +sub process_options { + + my( %options, @keys ) ; + + while( @_ ) { + + my $opt = shift ; + + if ( $sort_makers{ $opt } ) { + + if ( @keys ) { + die "no options after keys" + } + + if ( $options{ style } ) { + + die + "style was already set to $options{ style }" ; + } + + $options{ style } = $opt ; + $options{ $opt } = 1 ; + + next ; + } + + if ( $is_boolean_opt{ $opt } ) { + + if ( @keys ) { + die "no options after keys" + } + + $options{ $opt } = 1 ; + next ; + } + + if ( $is_value_opt{ $opt } ) { + + if ( @keys ) { + die "no value options after keys" + } + + unless( @_ ) { + + die "no value for option $opt" + } + + $options{ $opt } = shift ; + next ; + } + + if ( my( $order, $key ) = $opt =~ /$key_re/ ) { + + my $descend = ( $order || '' eq '+' ) ? 0 : + $options{ 'descending' } ; + + + my $no_case = ( $key =~ s/_(no_)case// ) ? + defined( $1 ) : $options{no_case} ; + + $key = 'number' if $key eq 'integer' && + !$options{GRT} ; + + my $code = $_[0] ; + + if ( defined( $code ) || ! $code =~ /$key_re/ ) { + + shift ; + } + else { + + $code = '$_' ; + } + + push( @keys, { + + key => $key, + no_case => $nocase, + descend => $descend, + code => $code, + } ) ; + + next ; + } + + die "unknown option or key '$opt'" ; + } + + return( \%options, \@keys ) ; +} + +__END__ + + +sub build_st { + + my( @st_compares, @st_extracts ) ; + my $st_ind = '1' ; + + foreach my $key ( @keys ) { + + my $st_compare = <<'CMP' ; + $a->[1] cmp $b->[1] +CMP + + $st_compare =~ tr/ab/ba/ if $key->{descending} ; + $st_compare =~ s/cmp/<=>/ if $key->{numeric} ; + $st_compare =~ s/1/$st_ind/g ; + + $st_ind++ ; + + push( @st_compares, $st_compare ) ; + + my $st_extract = <{code} } +EXT + + $st_extract =~ s/uc// if $key->{no_case} ; + + push( @st_extracts, $st_extract ) ; + } + + my $compare_source = join "||\n", @st_compares ; + + my $extract_source = join ",\n", @st_extracts ; + + my $source = <[0], + sort { +$compare_source + } + map [ $_, +$extract_source + ], @_ +} +SUB + +} + +sub build_plain { + + my( @plain_compares ) + + foreach my $key ( @keys ) { + + my $plain_compare = <<'CMP' ; + do{ my( \$left, \$right ) = map { $key->{code} } \$a, \$b; + \$left cmp \$right } +CMP + + $plain_compare =~ s/\$a, \$b/\$b, \$a/ if $key->{descending} ; + $plain_compare =~ s/cmp/<=>/ if $key->{numeric} ; + $plain_compare =~ s/=/= map uc,/ if $key->{no_case} ; + + push( @st_compares, $st_compare ) ; + + push( @st_extracts, $st_extract ) ; + } + + my $compare_source = join "||\n", @st_compares ; + + my $source = <{code} } +EXT + + $st_extract =~ s/uc// if $key->{no_case} ; + + push( @st_extracts, $st_extract ) ; + } + + my $extract_source = join ",\n", @st_extracts ; + + my $source = <[0], + sort + pack +$extract_source + , @_ ; +} +SUB + +} + + diff --git a/exp/bar2 b/exp/bar2 new file mode 100644 index 0000000..a8a1747 --- /dev/null +++ b/exp/bar2 @@ -0,0 +1,278 @@ +my %is_boolean_opt = map { $_ => 1 } qw( + descending + no_case + ref_in + ref_out + GRT_refs +) ; + +my %is_value_opt = map { $_ => 1 } qw( + name +) ; + +my %is_key = map { $_ => 1 } qw( + string + string + number + integer +) ; + +my %sort_makers = ( + + plain => \&make_plain_sort, + ST => \&make_ST_sort, + GRT => \&make_GRT_sort, +) ; + +my %sources ; +my $error_source ; + + +use Data::Dumper ; + +sub sorter_source { + + $sources{ +shift } || $error_source ; +} + +sort_maker() ; + + +sub sort_maker { + + my( $options, $keys ) = process_options( @ARGV ) ; + +print Dumper $options, $keys ; + + die "no keys specified" unless @{$keys} ; + + my $sort_maker = $sort_makers{ $options->{style} } ; + + die "no sort style selected" unless $sort_maker ; + + my $source = $sort_maker( $keys ) ; + + my $sorter = eval $source ; + + die "bad source $@" if $@ ; + + $sources{ $sorter } = $source ; + + if ( my $name = $options->{name} ) { + + no strict 'refs' ; + + my $package = (caller())[0] ; + + *${"${package}::$name"} = $sorter ; + } + + return $sorter ; +} + +sub process_options { + + my( %options, @keys ) ; + + while( @_ ) { + + my $opt = shift ; + + if ( $sort_makers{ $opt } ) { + + if ( @keys ) { + die "no options after keys" + } + + if ( $options{ style } ) { + + die + "style was already set to $options{ style }" ; + } + + $options{ style } = $opt ; + next ; + } + + if ( $is_boolean_opt{ $opt } ) { + + if ( @keys ) { + die "no options after keys" + } + + $options{ $opt } = 1 ; + next ; + } + + if ( $is_value_opt{ $opt } ) { + + if ( @keys ) { + die "no value options after keys" + } + + unless( @_ ) { + + die "no value for option $opt" + } + + $options{ $opt } = shift ; + next ; + } + + if ( my( $order, $key, $case ) = + $opt =~ /^([+-])?([a-z]+)(_\w+)?$/ ) { + + unless( $is_key{ $key } ) { + + die "unknown option '$opt'" ; + } + + my $descend = ( $order || '' eq '+' ) ? 0 : + $options{ 'descending' } ; + + my $nocase = $opt =~ s/_nocase$// || + $options{ nocase } ; + + $opt = 'numeric' if $opt eq 'integer' && + !$options{GRT} ; + + my $code = $_[0] ; + + unless ( defined( $code ) || + ( $code =~ /^([+-])?(\w+)$/ && $is_key{ $2 } ) ) { + + $code = '$_' ; + } + else { + shift ; + } + + push( @keys, { + + key => $key, + no_case => $nocase, + descend => $descend, + code => $code, + } ) ; + + next ; + } + + die "unknown option or key '$opt'" ; + } + + return( \%options, \@keys ) ; +} + +__END__ + + +sub build_st { + + my( @st_compares, @st_extracts ) ; + my $st_ind = '1' ; + + foreach my $key ( @keys ) { + + my $st_compare = <<'CMP' ; + $a->[1] cmp $b->[1] +CMP + + $st_compare =~ tr/ab/ba/ if $key->{descending} ; + $st_compare =~ s/cmp/<=>/ if $key->{numeric} ; + $st_compare =~ s/1/$st_ind/g ; + + $st_ind++ ; + + push( @st_compares, $st_compare ) ; + + my $st_extract = <{code} } +EXT + + $st_extract =~ s/uc// if $key->{no_case} ; + + push( @st_extracts, $st_extract ) ; + } + + my $compare_source = join "||\n", @st_compares ; + + my $extract_source = join ",\n", @st_extracts ; + + my $source = <[0], + sort { +$compare_source + } + map [ $_, +$extract_source + ], @_ +} +SUB + +} + +sub build_plain { + + my( @plain_compares ) + + foreach my $key ( @keys ) { + + my $plain_compare = <<'CMP' ; + do{ my( \$left, \$right ) = map { $key->{code} } \$a, \$b; + \$left cmp \$right } +CMP + + $plain_compare =~ s/\$a, \$b/\$b, \$a/ if $key->{descending} ; + $plain_compare =~ s/cmp/<=>/ if $key->{numeric} ; + $plain_compare =~ s/=/= map uc,/ if $key->{no_case} ; + + push( @st_compares, $st_compare ) ; + + push( @st_extracts, $st_extract ) ; + } + + my $compare_source = join "||\n", @st_compares ; + + my $source = <{code} } +EXT + + $st_extract =~ s/uc// if $key->{no_case} ; + + push( @st_extracts, $st_extract ) ; + } + + my $extract_source = join ",\n", @st_extracts ; + + my $source = <[0], + sort + pack +$extract_source + , @_ ; +} +SUB + +} + + diff --git a/exp/closure.pl b/exp/closure.pl new file mode 100644 index 0000000..b68ff05 --- /dev/null +++ b/exp/closure.pl @@ -0,0 +1,20 @@ +#!/usr/local/bin/perl + +use strict ; + +sub make { + my $foo = 3 ; + + my $sub = sub { print "sub called\n@_\n$foo\n", $foo + 1, "\n" } ; + +my $n = eval <<'CODE' ; + my $s = $sub ; + sub { $s->(@_) } +CODE + return $n ; +} + +my $m = make() ; + +$m->('bar') ; + diff --git a/exp/clpmodules_bench.pl b/exp/clpmodules_bench.pl new file mode 100644 index 0000000..741c1f2 --- /dev/null +++ b/exp/clpmodules_bench.pl @@ -0,0 +1,51 @@ +#!/usr/local/bin/perl + + use Sort::Maker; + use Benchmark 'cmpthese'; + + sub build_data { + my @r_order = + map { ('_', 0..9, 'A'..'Z', 'a'..'z')[rand 63] } 1..shift; + my @unsorted = + map qq|
|, + @r_order; + my @c_order; + push @c_order, splice( @r_order, rand @r_order, 1 ) + while @r_order; + join( '', @c_order ), @unsorted + } + + my ($cost_order, @unsorted) = build_data(2); + + my $sorter = make_sorter( + 'GRT', + init_code => "my \$cost_order = '$cost_order';", + signed => 1, + string_data => 1, + number => q{ /code="(.)"/ && index($cost_order,$1) }, + ); + + cmpthese( shift || -5, { + 'S-Maker' => sub { + my @sorted = $sorter->( @unsorted ); + }, + + 'S-Maker-compiling' => sub { + + my $sorter = make_sorter( + 'GRT', + init_code => "my \$cost_order = '$cost_order';", + signed => 1, + string_data => 1, + number => q{ /code="(.)"/ && index($cost_order,$1) }, + ); + my @sorted = $sorter->( @unsorted ); + }, + 'grep()' => sub { + my $co = $cost_order; + my @sorted = (); + while ( my $mc = chop $co ) { + unshift @sorted, grep /code="$mc"/, @unsorted; + } + }, + } ); diff --git a/exp/cmp.pl b/exp/cmp.pl new file mode 100644 index 0000000..c079d8e --- /dev/null +++ b/exp/cmp.pl @@ -0,0 +1,21 @@ +#!/usr/local/bin/perl + +use Benchmark ; + +my @in = map rand 50000, 1 .. 1000 ; + +timethese( shift || -2, { + + plain => sub { my @out = sort { $a <=> $b } @in }, + + slice => sub { my @out = sort { ($a)[0] <=> ($b)[0] } @in }, + + map => sub { my @out = sort { + (map {$_}$a)[0] <=> (map {$_}$b)[0] } @in }, + + temp => sub { my @out = sort { + my( $l, $r ) = map {$_} $a, $b; $l <=> $r } @in }, + +} ) ; + + diff --git a/exp/endian.pl b/exp/endian.pl new file mode 100644 index 0000000..6284cd4 --- /dev/null +++ b/exp/endian.pl @@ -0,0 +1,7 @@ + +use constant BIG_ENDIAN => pack('N', 1) eq pack('L', 1) ; +use constant FLOAT_LEN => length( pack( 'd', 1 ) ) ; +use constant FLOAT_PACK => BIG_ENDIAN ? + q{pack( 'd', $_ )} : q{reverse( pack( 'd', $_ ) )} ; + +print FLOAT_PACK, "\n" ; diff --git a/exp/endian2.pl b/exp/endian2.pl new file mode 100644 index 0000000..174d868 --- /dev/null +++ b/exp/endian2.pl @@ -0,0 +1,20 @@ +#!/usr/local/bin/perl + + +my $big_endian = + pack('N', 1) eq + pack('L', 1); + +print "BIG\n" if $big_endian ; + + + + sub float_sort ($) { + ($big_endian ? + pack 'd', $_[0] : + reverse pack 'd', $_[0]) ^ + ($_[0] < 0 ? "\xFF" x 8 : + "\x80" . "\x00" x 7) + } + + diff --git a/exp/float.pl b/exp/float.pl new file mode 100644 index 0000000..437386e --- /dev/null +++ b/exp/float.pl @@ -0,0 +1,35 @@ +#!/usr/local/bin/perl -w + +use strict ; + +use constant BIG_ENDIAN => pack('N', 1) eq pack('L', 1) ; +use constant FLOAT_LEN => length pack "d", 1 ; +use constant INT_LEN => length pack "N", 1 ; +use constant INT_BIT_LEN => INT_LEN * 8 ; +use constant FLOAT_PACK => BIG_ENDIAN ? + q{pack( 'd', $val )} : + q{reverse( pack( 'd', $val ) )} ; + +# xor masks to normalize negative and positive floats +my $xor_neg = "\xFF" x FLOAT_LEN ; + +# this only flips the sign bit (it gets extended by null bytes in the ^ op) +my $xor_pos = "\x80" ; + + + +sub foo { + my $rec_ind = 0 ; + + return @_[ + map unpack( 'N', substr( $_, -4 ) ), + sort + map pack( "A8N", + do{ my ($val) = $_ ; print "VAL [$val]\n" ; ( $val < 0 ? $xor_neg : $xor_pos ) ^ pack( 'd', $val ) }, + $rec_ind++ + ), @_ + ] ; +} + +foo( 1, 4, -2 ) ; + diff --git a/exp/map_context.pl b/exp/map_context.pl new file mode 100644 index 0000000..9c74a8b --- /dev/null +++ b/exp/map_context.pl @@ -0,0 +1,14 @@ +#!/usr/local/bin/perl -w + +use strict ; + +sub bar { + print "ARRAY\n" if wantarray ; + my @a = ( qw( a d g ) ) ; + + return @a ; +} + +my ($x) ||= map { bar() } 1 ; + +print "$x\n" ; diff --git a/exp/num_NA.pl b/exp/num_NA.pl new file mode 100644 index 0000000..96d04d6 --- /dev/null +++ b/exp/num_NA.pl @@ -0,0 +1,13 @@ +#!/usr/local/bin/perl + +use Sort::Maker ; + +my $sorter = make_sorter( 'GRT', number => '/\D/ ? 0 : $_' ) ; +print "$@" unless $sorter ; + + +print map "$_\n", $sorter->( qw( 22 1 10 NA 33 NA ) ) ; + + + + diff --git a/exp/pad_bench.pl b/exp/pad_bench.pl new file mode 100644 index 0000000..59927ba --- /dev/null +++ b/exp/pad_bench.pl @@ -0,0 +1,24 @@ +#!/usr/local/bin/perl + +use strict ; +use warnings ; + +use Benchmark ; + +my $len = 10 ; +use constant LEN => 10 ; +my $text = 'abc' ; + +my $pad = "\0" x $len ; + +timethese( shift || -2, { + + or => '$text | $pad', + xor => '$text ^ $pad', + padx => '$text . ("\0" x ( $len - length( $text ) ) )', + pad_substr => 'substr( $text . ("\0" x $len), 0, $len )', + pad_substr3 => 'substr( "$text$pad", 0, $len )', + padx2 => '$text . ("\0" x ( LEN - length( $text ) ) )', + pad_substr2 => 'substr( $text . ("\0" x LEN), 0, LEN )', + } +) ; diff --git a/exp/sort_rows.pl b/exp/sort_rows.pl new file mode 100644 index 0000000..6e8dbc7 --- /dev/null +++ b/exp/sort_rows.pl @@ -0,0 +1,34 @@ +#!/usr/local/bin/perl + +use strict ; +use warnings ; + +use Sort::Maker qw( make_sorter sorter_source ) ; + +my @unsorted = ( + + '
', + '
', + '
', + '
', +) ; + +my $cost_order = 'A2yB'; + +my $sorter = make_sorter( + 'GRT', +# init_code => "my \$cost_order = '$cost_order' ;", +# init_code => 'my $cost_order = q{A2yB};', + signed => 1, + string_data => 1, +# number => q{ /code="(.)"/ && index($cost_order,$1) }, + number => sub{ /code="(.)"/ && index($cost_order,$1) }, +) ; + +$sorter or die $@ ; + +print sorter_source( $sorter ) ; + +print map "$_\n", $sorter->( @unsorted ) ; + + diff --git a/exp/x_bench.pl b/exp/x_bench.pl new file mode 100644 index 0000000..02e674b --- /dev/null +++ b/exp/x_bench.pl @@ -0,0 +1,22 @@ +#!/usr/local/bin/perl + +#!/usr/local/bin/perl + +use strict ; +use warnings ; + +use Benchmark ; + +my $text = 'abcdffgjhjh' ; +my $pad = "\0" x 10 ; +my $pad2 = "\0" ; + +timethese( shift || -2, { + + long => '$text | "\0\0\0\0\0\0\0\0\0\0"', + short => '$text | "\0"', + const => '$text | ("\0" x 10)', + var => '$text | $pad', + var_short => '$text | $pad2', + } +) ; diff --git a/paper/abstract b/paper/abstract new file mode 100644 index 0000000..54573a6 --- /dev/null +++ b/paper/abstract @@ -0,0 +1,20 @@ +A Fresh Look at Efficient Perl Sorting + +Uri Guttman, John Porter, and Larry Rosler + +Abstract + +Sorting is one of the primary needs of programming, because most +programs produce reports of sorted data. Efficient sorting therefore +receives a great deal of attention in the Perl documentation, FAQ, +FMTYEWTK, books, and newsgroups. In this paper, we review and compare +several methods for Perl sorting: external ('system'), +repeatedly-computed-keys ('naive'), cached-keys ('Orcish Maneuver'), and +mapped-keys ('Schwartz Transform'). We then present a little-known +approach ('single-string key-and-value'), which relies for efficiency on +the default sort algorithm. This method typically provides much better +performance than the other approaches, and is easy to implement directly +or using a module. + + + diff --git a/paper/bench_loops.pl b/paper/bench_loops.pl new file mode 100644 index 0000000..f068fa5 --- /dev/null +++ b/paper/bench_loops.pl @@ -0,0 +1,28 @@ +#!/usr/local/bin/perl + +use Benchmark; + +use integer ; + +@list_100 = ( 1 .. 100 ) ; + +@list_10k = ( 1 .. 10_000 ) ; + +#@list_1m = ( 1 .. 1_000_000 ) ; + +timethese( 1 << 4, { + +# for_100 => sub { my @out; push( @out, $_ + 1 ) foreach @list_100 }, + +# map_100 => sub { my @out; @out = map $_ + 1, @list_100 }, + + for_10k => sub { my @out; push( @out, $_ + 1 ) foreach @list_10k }, + + map_10k => sub { my @out; @out = map $_ + 1, @list_10k }, + + for_10k_null => sub { my @out; push( @out, $_ ) foreach @list_10k }, + + map_10k_null => sub { my @out; @out = map $_, @list_10k }, + + } +) ; diff --git a/paper/field_sort.pl b/paper/field_sort.pl new file mode 100644 index 0000000..fbed1a7 --- /dev/null +++ b/paper/field_sort.pl @@ -0,0 +1,114 @@ +#!/usr/local/bin/perl + +use strict; + +# L. Rosler, January 10, 1999 +# Interface modeled after +http://www.effectiveperl.com/recipes/sorting.html +# by Joseph Hall. +# @sorted = fieldsort( ['optional split pattern as string',] +# [ anon array of specifiers (strings or optionally signed integers) +# including, in any order: +# optional '-' for reverse sort order, +# required integer for field number (1 = first), +# optional letter for field type: none = string; +# 'sN' = first N chars of string; 'n' = optionally signed integer; +# 'u' = unsigned integer; 'f' = optionally signed floating-point number +], +# @data); + +sub fieldsort { + my ($col, $cols, @len); + my $sep = !ref $_[0] && shift; + ref($cols = shift) eq 'ARRAY' or + die "Field specifiers must be in an anon array.\n"; + my @cols = map { /^-(\d+)$|^(\d+)-$/ ? $+ : () } @$cols; + if (@cols) { # Preprocess variable-width negated string sorts. + @len = (0) x @$cols; + my $split = sub { length $sep ? split m{$sep} : split }; + foreach (@_) { + my @a = $split->(); + foreach $col (@cols) { + $len[$col - 1] = length $a[$col - 1] + if $len[$col - 1] < length $a[$col - 1]; + } + } + } + my ($fixed, @packcode, @packargs) = 0; + for (@$cols) { + my ($col) = /(\d+)/ or die "No field specifier in $_.\n"; + my ($let) = /([a-z])/; + unless ($let) { # variable-length string + push @packcode, 'A*x'; + push @packargs, + "\$a[$col - 1]" . (/-/ && qq{ ^ "\\xFF" x $len[$col - +1]}); + $fixed = -1; + } elsif ($let eq 's') { # fixed-length string or substr + my ($len) = /(\d+)$/ or die "No length specified for field +$col.\n"; + push @packcode, "A$len"; + push @packargs, "\$a[$col - 1]" . (/-/ && qq{ ^ "\\xFF" x +$len}); + $fixed += $len if $fixed > -1; + } elsif ($let eq 'u') { # unsigned integer + push @packcode, 'N'; + push @packargs, (/-/ && '-') . "\$a[$col - 1]"; + $fixed += 4 if $fixed > -1; + } elsif ($let eq 'n') { # signed integer + push @packcode, 'N'; + push @packargs, (/-/ && '-') . "\$a[$col - 1] ^ (1 << 31)"; + $fixed += 4 if $fixed > -1; + } elsif ($let eq 'f') { # floating-point number + push @packcode, 'A8'; + push @packargs, '_float_sort(' . (/-/ && '-') . "\$a[$col - +1])"; + $fixed += 8 if $fixed > -1; + } else { die "Unrecognized type specifier $let in field $col.\n" +} + } + eval 'map substr($_, ' . ($fixed < 0 ? '1 + rindex $_, "\0"' : +$fixed) . + '), sort map { my @a = split' . (length $sep > 0 && " m{$sep}o") +. + ";\npack '" . join("", @packcode) . + ($fixed < 0 && substr($packcode[-1], -1) ne 'x' && 'x') . + join(",\n", "A*'", @packargs, '$_') . '} @_' +} + +BEGIN { + my $big_endian = pack('N', 1) eq pack('L', 1); + sub _float_sort { + (($big_endian ? pack 'd', $_[0] : reverse pack 'd', $_[0]) ^ + ($_[0] < 0 ? "\xFF" x 8 : "\x80" . "\x00" x 7)) . $_[0] + } +} + + +# Example usage: + +my (@data, @sorted); +@data = ('abc 2 zzz', 'abc 10 zzz', 'def 2 zzz'); +print q{@sorted = fieldsort [2, 1], @data}, "\n"; +@sorted = fieldsort [2, 1], @data; +{ local $" = '|'; print "@sorted\n"; } +@data = qw(abc+2+zzz abc+10+zzz abc+2+aaxx abc+2+ab def+2+zzz +abc+-1+zzz); +print q{@sorted = fieldsort '\\+', ['3s2', '-2n', -1], @data}, "\n"; +@sorted = fieldsort '\\+', ['3s2', '-2n', -1], @data; +print "@sorted\n"; +@data = qw(abc:2:zzz abc:10:zzz abc:2:aaaa def:2:zzz abc:-1:zzz); +print q{@sorted = fieldsort ':', ['2n', -1], @data}, "\n"; +@sorted = fieldsort ':', ['2n', -1], @data; +print "@sorted\n"; +print q{@sorted = fieldsort ':', ['-2f', -1, '3-'], @data}, "\n"; +@sorted = fieldsort ':', ['-2f', -1, '3-'], @data; +print "@sorted\n"; +print q{@sorted = fieldsort ':', ['-2n', '-1s3'], @data}, "\n"; +@sorted = fieldsort ':', ['-2n', '-1s3'], @data; +print "@sorted\n"; +print q{@sorted = fieldsort ':', ['-2n', -1], @data}, "\n"; +@sorted = fieldsort ':', ['-2n', -1], @data; +print "@sorted\n"; + + diff --git a/paper/make_data.pl b/paper/make_data.pl new file mode 100755 index 0000000..0be7ed4 --- /dev/null +++ b/paper/make_data.pl @@ -0,0 +1,110 @@ +#!/usr/local/bin/perl + + +@data = rand_ints( 10, 4, 8 ) ; + +print "@data\n" ; + + +@data = rand_padded_ints( 10, 4, 8 ) ; + +print "@data\n" ; + +@data = rand_deal( 10, 4 .. 8 ) ; + +print "@data\n" ; + + +@data = rand_deal( 10, 'aa0' .. 'zz9' ) ; + +print "@data\n" ; + +@data = rand_picks( 10, 'aa0' ) ; + +print "@data\n" ; + + + +sub rand_picks { + + my( $count, $pattern ) = @_ ; + + my( @rand, @chars, $char, $pick, @digits, @alphas ) ; + + @chars = split( '', $pattern ) ; + + @digits = ( '0' .. '9' ) ; + @alphas = ( 'a' .. 'z', 'A' .. 'Z' ) ; + + while( $count-- ) { + + $pick = '' ; + + foreach $char ( @chars ) { + + if ( $char eq 'a' ) { + + $pick .= $alphas[ rand 52 ] ; + next ; + } + + if ( $char eq '0' ) { + + $pick .= $digits[ rand 10 ] ; + next ; + } + } + + push( @rand, $pick ) ; + } + + @rand ; +} + +sub rand_deal { + + my( $count, @deck ) = @_ ; + + my( @rand ) ; + + push( @rand, $deck[ rand( @deck ) ] ) while $count-- ; + + @rand ; +} + + + +sub rand_ints { + + my( $count, $base, $range ) = @_ ; + + my( @rand ) ; + + push( @rand, int( rand( $range ) + $base ) ) while $count-- ; + + @rand ; +} + + + +sub rand_padded_ints { + + my( $count, $base, $range ) = @_ ; + + my( @rand, $pad ) ; + + $pad = length( $base + $range ) ; + + push( @rand, sprintf( "%0${pad}d", int( rand( $range ) + $base ) ) ) + while $count-- ; + + @rand ; +} + + + + + +#splice( @data, int( rand( @data + 1 ) ), 0, $base++ ) while $cnt-- ; + +#print "@data\n" ; diff --git a/paper/outline_notes b/paper/outline_notes new file mode 100644 index 0000000..41eea92 --- /dev/null +++ b/paper/outline_notes @@ -0,0 +1,1133 @@ +Return-Path: +From: John Porter +Subject: Re: sort article outline +To: uri@sysarch.com +Date: Mon, 28 Dec 1998 11:46:32 -0500 (EST) +Cc: lr@hpl.hp.com +In-Reply-To: <199812280441.XAA11943@home.sysarch.com.> from "Uri Guttman" at Dec 27, 98 11:41:54 pm +Content-Type: text/plain; charset=US-ASCII +Content-Length: 1680 + +> JP> Still linear. Maybe the constants are painfully large, but we're +> JP> still looking at O(n). +> +> i think i see why we don't agree here. i am talking about the linear set +> of bytes and you are talking about liner O(n) for the work. is that it? + +Yep. + + +> >> hey, it's english. +> +> JP> Marginally. +> +> ??? marginal quality english or marginally in english? + +The latter. My point is that wrt computer jargon, esp. Perl jargon, +english is not much of a standard. I.e. what "string" means in +english has only marginal bearing on what it means in Perl. + + +> different names. each has advantages. the ascii version can use short +> integer strings (<4 bytes) and is always endian independent. you could +> pack into a byte or short if you want to. binary can +> be more compact for some data formats (floats in particular) but endian +> issues come up and have to be dealt with. +> +> you can cover the concept as one but the conversion process is different +> as one uses sprintf and one uses pack. though the sprintf could use pack +> is some ways. the sprintf is used to zero pad integers and print floats +> in a sortable format. pack is used to store the integers in a byte order +> that cmp can sort. + +O.k, I think I see what you're getting at. "Stringify" (in Perl) has +always refered to the conversion which occurs transparently when a +non-string SV gets used as a string. And more generally, the same +effect can be achieved by sprintf. The result is a string which, when +printed, represents the non-string value of the SV (be it integer, float, +reference, whatever). + +So maybe the right way to describe a string which is produced by pack +is "packed". + +John + + + + +Return-Path: +From: John Porter +Subject: Re: sort article outline +To: uri@sysarch.com +Date: Sun, 27 Dec 1998 18:15:36 -0500 (EST) +In-Reply-To: <199812262117.QAA09392@sysarch.com.> from "Uri Guttman" at Dec 26, 98 04:17:34 pm +Content-Type: text/plain; charset=US-ASCII +Content-Length: 3189 + + +Uri wrote: +> not if you are packing integers and floats with optional endian +> reversals. the whole concept is to convert various data types to one +> concatenated set of keys sortable by cmp. cmp can't handle ints and +> floats normally. larry showed how to handle signed ints and floats with +> endian issues covered. so there is a conversion to a linear set of bytes +> sortable by cmp. the original "disparate set of keys" has been mapped to +> a single sortable key. + +Still linear. Maybe the constants are painfully large, but we're still +looking at O(n). + + +> JP> Why should we pander to the lcd? +> +> i don't think it is pandering at all. all i think is one form is a +> subset of the other form and should have a different name. + +Well clearly, printable characters are a subset (variously defined) +of the set of octets. My point is that Perl doesn't care, and +Perl programmers shouldn't care. In Perl terminology, a string is +the octet buffer part of a SV. Printability is irrelevant. + + +> there are two issues we are arguing +> here, whether there should be two names and what those names are. + +Yep. + +> JP> This is where you and I fundamentally disagree, I guess. +> +> hey, it's english. + +Marginally. + +> JP> Nothing wrong with that. The raw printability of the characters +> JP> on any given terminal is irrelevant, imho. +> +> i disagree. some may choose to use the printable form for that very +> attribute. sometime the converted keys may be kept around and used for +> other long term purposes where printability may be a needed attribute. + +That's why this sorting issue is so open-ended. We can't lay down +some hard-and-fast technique (like the ST). If the printability is +critical, then the programmer has to work within that constraint. +But if the performance of the sort is of primary concern, then she +may have to sacrifice the intrinsic printability of the comparison +keys; then they can do a transformation back into printable form, +or cache the printable form, or whatever. + + +> by having 2 names for 2 subtly different packing methods i think +> that helps them decide which to choose. + +I'm all for educating the programmer on the issues and choices. + + +> JP> I think this is a bad misuse of the term "linear". +> JP> Maybe you could come up with some other term that expresses +> JP> that notion. +> +> well, i agree the term may not be the best, but do you agree on the +> difference between the methods, as i hve described above? + +Clearly not. + + +> JP> You and I don't see eye-to-eye on this. +> +> maybe my argument above will convince you more. or i can get your +> mother-in-law to agree with me and then you will be sorry! + +Um, unfortunately, you have only clarified your position. +You have not added any persuasive content. + + +> JP> I remember he said something like that about the ST specifically; +> JP> don't remember anything about Perl generally... +> +> maybe my memory was faulty but he was so negative on others that i +> remember a slam on perl in some letters. it would be tricky to scan him +> on dejanews for just that. + +I'm not saying I don't believe he said such things; I just don't +remember them. Knowing Rutka, he probably did! + +John + + + + +Return-Path: +From: John Porter +Subject: Re: sort article outline +To: uri@sysarch.com +Date: Sat, 26 Dec 1998 11:30:34 -0500 (EST) +Cc: lr@hpl.hp.com +In-Reply-To: <199812260558.AAA08301@sysarch.com.> from "Uri Guttman" at Dec 26, 98 00:58:39 am +Content-Type: text/plain; charset=US-ASCII +Content-Length: 2668 + + +Uri wrote: +> >>>>> "JP" == John Porter writes: +> +> >> also if you +> >> put integers in the perl byte string, then it is not stringifying which +> >> slightly implies ascii. +> +> JP> Not in Perl, imho. +> JP> I think cmp implies strings, regardless of the character value range. +> JP> We're talking about comparing arrays of byte-sized "characters", +> JP> i.e. strings. +> +> >> so linearization is a better overall term, +> +> JP> I find your argument, such as it is, unconvincing. +> JP> In what way has the process made anything linear? +> JP> That's what "linearization" means, after all. +> +> you are taking a disparate set of keys and making them into a single key +> of a linear set of bytes. + +Even so, the "disparate set of keys" is already linear, wrt +the sorting algorithm. At least if it's done right, which +presumably it is. + + +> i understand that perl is 8 bit clean regarding strings and cmp. and cmp +> is an unsigned compare (IIR). but i (and some others) feel that the word +> string connotes some printable attributes. + +Anyone harboring that notion should be edified. Why should we +pander to the lcd? + + +> >> (ascii really means printing chars, and probably just digits and +> >> us-ascii unless you use locale and it works for extendied ascii). +> +> JP> Um, imho, all this stuff about ascii is irrelevant. +> +> well how would you differentiate converting a key to printable ascii +> vs. integer bytes? + +The important question is, *why* would you? +I don't think we should, in this case at least. + + +> stringifying implies printable. + +This is where you and I fundamentally disagree, I guess. + +> but if you create a 8 bit set of bytes that would be something +> more complex that won't print (without going to hex or something). + +Nothing wrong with that. The raw printability of the characters +on any given terminal is irrelevant, imho. + + +> that +> is linearization since a set of integers and floats, are now a linear +> set of sortable bytes, but not a printable string. + +I think this is a bad misuse of the term "linear". +Maybe you could come up with some other term that expresses +that notion. + + +> this is fine edge semantics, but i do like having two terms for the +> printable and non-printable converted keys. maybe linearization is not a +> great term but i want to reserve stringifying for printable only. + +Whatever. You and I don't see eye-to-eye on this. + + +> i recall [Rutka] had some early misstatements about not liking perl +> (or we interpreted his poor wording to mean that). + +I remember he said something like that about the ST specifically; +don't remember anything about Perl generally... + +John + + + + +Return-Path: +From: John Porter +Subject: Sorting article +To: uri@sysarch.com +Date: Sat, 2 Jan 1999 23:45:47 -0500 (EST) +Cc: lr@hpl.hp.com +In-Reply-To: <199901021831.NAA12831@home.sysarch.com.> from "Uri Guttman" at Jan 2, 99 01:31:45 pm +Content-Type: text/plain; charset=US-ASCII +Content-Length: 1354 + + +> John wrote: +> > Uri wrote: +> > > in both cases, the original data is usually concatenated onto the end of +> > > the created sort key and then extracted (usually with substr) after the +> > > sort is done. neither string sort can handle perl references directly, +> > > though the referenced data can be used to create a single sort key and +> > > then ST or OM can then be used for the actual sort. +> > +> > I don't think I'd say "usually". You should say "possibly", with +> > a description of the situations that allow it. +> +> fine. what situations exist where you would not need to concatenate the +> original data onto the key? + +I'm not talking not needing to do it, I'm talking about not being able +to do it. If you're sorting a list of "objects" e.g., you can't simply +concat a reference onto the end of a string. However, as you pointed out, +one could simply concat an index for the object; so now I can't think of +any situations where the pack technique can't be used. + +The slice of the pie I'd like to tackle is addressing certain +issues, namely: +1. Comparison keys are unique, slightly non-unique, or highly non-unique; +2. Cost of deriving the comparison key from each datum is nil, small, + or large. +According to my benchmarks, these had a significant influence on the +relative performance of various techniques (ST, OM, etc.). + +John + + + + +Return-Path: +From: John Porter +Subject: Re: Sorting article +To: uri@sysarch.com +Date: Mon, 4 Jan 1999 08:59:18 -0500 (EST) +Cc: lr@hpl.hp.com +In-Reply-To: <199901030455.XAA13708@home.sysarch.com.> from "Uri Guttman" at Jan 2, 99 11:55:35 pm +Content-Type: text/plain; charset=US-ASCII +Content-Length: 1306 + +Uri wrote: +> good. i like the point you make about the index vs. the reference. this +> could be a sort of combination of swchartzian and stringify (either +> flavor). if you have to sort a list of refs (or complex stuff), +> preprocess into a array with a single ref to the real data, make a list +> of stringified keys and append the index to the key and sort with +> builtin cmp. then extract out the indexes and original ref(s). the +> schwartzian part is doing it with maps. + +The similary to ST had occured to me. + map { (split("\0",$_,2))[0] } + sort + map { join "\0", $_->cmpkey(), $_ } + @a + +But this is as close as it gets. The other "advantage" of ST -- +multiple key comparisons -- is obviated by our technique. + + +> tho i beleive temp arrays and +> foreach modifiers can be or are faster than maps. we have seen cases +> where multiple maps ar slow. i think it is because it has to recopy all +> the values for each map. a foreach on a temp array would use the same +> array without copying. + +I thought the slow part was all those anonymous arrays which are +created in the first map. map itself doesn't allocate an AV, it makes +a list. Same with sort. So if you're sorting a list of 1000 items, +you have three lists created (map, sort, map), but 1000 anonymous AV's +allocated in the first map. + +John + + + + +Return-Path: +From: Larry Rosler +To: "'Uri Guttman'" , jdporter@min.net +Subject: RE: Sorting article +Date: Mon, 4 Jan 1999 12:55:54 -0800 +Content-Type: text/plain; + charset="iso-8859-1" +Content-Length: 4272 + +> From: Uri Guttman [mailto:uri@sysarch.com] +> Sent: Monday, January 04, 1999 08:23 +> To: jdporter@min.net +> Cc: lr@hpl.hp.com +> Subject: Re: Sorting article +... +> P.S. larry has some catchup reading and replying to do now that he +> should be back from vacation. + +Yes, I am back. We drove 7 hours yesterday from Southern California, +much of it through some heavy fog in the Central Valley, so I am still a +little bleary. And I have tried to plow through over 1200 articles in +c.l.p.misc, with so far only two conclusions: + +1. The group got along fine without me (something to think about for +the future :-). + +2. Uri seems to have had a couple of really bad days through the past +week or so. :-( + +I haven't had a chance to read all of the correspondence between you two +yet, but I did skim a couple, and noticed the mention of sorting +indexes. I "solved" the problem of using the default sort on references +while on vacation, *literally* while I was sleeping (i.e., I woke up and +wrote it down!). I then generalized it some, and thought it would be a +world-beater speedwise. But my first measurements are somewhat +disappointing. + +Here is what I have so far; I'm sharing it before doing much else +because your comments and observations would be most useful. It is +actually a different sorting paradigm, as you will see: + +EVOLUTION OF EFFICIENT PERL SORTS + +Naive { sortkey(value_a) cmp sortkey(value_b) } +Orcish $h{ value } ||= sortkey OR @h{ values } = sortkeys +Schwartz [ value, sortkey, ... ] +Linearized sortkey . value +New @h{ sortkeys } = values + + + +#!/usr/local/bin/perl -w +use Benchmark; + +my $n = 10000; +my @a = map { [ int(rand 1000), 'foo' . int(rand 1000) ] } 1 .. $n; + +sub new { + my $i = 'aaa'; + my %h; + @h{ map pack('N A* A*' => $_->[0], $_->[1], $i++) => @a } = @a; + my @out = @h{ sort keys %h }; +} + +sub usual { + my @out = sort { $a->[0] <=> $b->[0] || $a->[1] cmp $b->[1] } @a; +} + +timethese(1 << (shift || 0), { + New => sub { &new }, + Usual => sub { &usual }, +}); +__END__ + +Benchmark: timing 16 iterations of New, Usual... + New: 10 wallclock secs ( 9.56 usr + 0.14 sys = 9.70 CPU) + Usual: 13 wallclock secs (12.92 usr + 0.00 sys = 12.92 CPU) + +Comments: + +$i serves two functions: insuring the uniqueness of the sort key (which +is saved as the key to a hash, so must be unique); preserving the +stability of the sort (entries that sort equal preserve their order in +the output array). + +Do you like those two hash slices, Uri? Take a look at the next +attempt, which adds an array slice. + +In the above case, storing the references themselves as values in the +hash is efficient. But I was concerned about generalizing this sorting +paradigm to values that were not references, but might be arbitrarily +long strings. Before I left on vacation, someone had posted about +perhaps sorting references to the strings, or indexes into the array. +So I tried the following: + +#!/usr/local/bin/perl -w +use Benchmark; + +my $n = 10000; +my @a = map { sprintf "XXX %d %s %s", int(rand 1000), 'foo' . int(rand +1000), + 'x' x 100 . "\n" } 1 .. $n; + +sub indexed { + my $i = 'aaa'; + my %h; + @h{ map pack('N A* A*' => /(\d+) (\w+)/, $i++) => @a } = 0 .. $#a; + my @out = @a[ @h{ sort keys %h } ]; +} + +sub linear { + my @out = map { substr($_, 1 + rindex $_, "\0") } sort + map pack('N A* x A*' => /(\d+) (\w+)/, $_) => @a; +} + +timethese(1 << (shift || 0), { + Indexed => sub { &indexed }, + Linear => sub { &linear }, +}); +__END__ + +Benchmark: timing 16 iterations of Indexed, Linear... + Indexed: 15 wallclock secs (14.06 usr + 0.58 sys = 14.64 CPU) + Linear: 13 wallclock secs (12.44 usr + 0.61 sys = 13.05 CPU) + +Comments: + +I had hoped that this would be better, because it seemingly avoids +appending the data to the sortkeys and then extracting via the substrs. +But both of those operations take place in the O(n) part of the sort, +not in the O(n log n). So perhaps I was expecting too much. But why is +it *slower*??? + +I have to catch up now on the rest of the world, including my *real* +job. But I wanted to get this off my chest (and onto yours? :-) ASAP. + +HNY!!! + +Larry + +-- +Larry Rosler +Hewlett-Packard Company +http://www.hpl.hp.com/personal/Larry_Rosler/ +lr@hpl.hp.com + + + +From: Larry Rosler +To: "'John Porter'" +Cc: uri@sysarch.com +Subject: RE: Sorting article +Date: Wed, 6 Jan 1999 13:27:16 -0800 +Content-Type: text/plain +Content-Length: 1096 + +> From: John Porter [mailto:jdporter@min.net] +> Sent: Wednesday, January 06, 1999 13:02 +> To: lr@hplb.hpl.hp.com +> Cc: uri@sysarch.com +> Subject: Re: Sorting article +> +> +> Larry wrote: +> > New @h{ sortkeys } = values +> >... +> > @h{ map pack('N A* A*' => $_->[0], $_->[1], $i++) => @a } = @a; +> > my @out = @h{ sort keys %h }; +> +> This actually reminds me of a technique I first learned about from +> Joseph Hall, see EPP, the bottom of p. 47, where he does something +> similar with an array. He calls it a "mundane" way to sort. ! +> +> John + +I think you clipped the wrong piece from my letter. Hall is +demonstrating the sort-the-indexes sort, but his sort function uses the +'mundane' key comparisons (from an array instead of a hash). I was +trying to combine the index sort with the default sort. This is the +right clip: + +sub indexed { + my $i = 'aaa'; + my %h; + @h{ map pack('N A* A*' => /(\d+) (\w+)/, $i++) => @a } = 0 .. $#a; + my @out = @a[ @h{ sort keys %h } ]; +} + +The nesting of the hash slice inside of the array slice is not +'mundane', is it? + +Larry + + + +From: Larry Rosler +To: "'Uri Guttman'" , jdporter@min.net +Subject: RE: sort article outline +Date: Thu, 7 Jan 1999 23:05:27 -0800 +Content-Type: text/plain +Content-Length: 1258 + +Hi. I'm still way behind on stuff, especially this work. But here is a +thought on terminology. + +Rather than 'stringified sort' or 'linearized sort', how about +'lexicographic sort'? If that isn't definitive enough, consider 'pure +lexicographic sort' or 'default lexicographic sort'. + +Which reminds me: Whoever now owns the draft of the paper (Uri?) -- +make sure to include a note that all this requires that 'no locale;' be +in effect, or the idea falls apart. This is from perllocale: + + + no locale; + print +(sort grep /\w/, map { chr() } 0..255), "\n"; + +This machine-native collation (which is what you get unless use locale +has appeared earlier in the same block) must be used for sorting raw +binary data, whereas the locale-dependent collation of the first example +is useful for natural text. + + +Note the 'sorting raw binary data' which is what we are talking about. + +I dislike 'stringified' or 'stringized' for several reasons, not the +least of which is that verbification of adjectives isn't good English, +IMO. 'Linearized' (also a verbification of an adjective, but with a +better pedigree) may be a little off-beat semantically. I think +'lexicographic' comes from the C Standard -- I'll take a look tomorrow. + +Larry + + + +From: Larry Rosler +To: "'Uri Guttman'" +Subject: RE: sort article outline +Date: Fri, 8 Jan 1999 10:36:21 -0800 +Content-Type: text/plain; + charset="iso-8859-1" +Content-Length: 7713 + +> From: Uri Guttman [mailto:uri@sysarch.com] +> Sent: Friday, January 08, 1999 10:07 +> To: lr@hpl.hp.com +> Cc: jdporter@min.net +> Subject: Re: sort article outline +> +> larry, +> +> i seemed to have misplaced (lost) your comments to my outline. do you +> have that letter and can you forward it to me? i suspect you keep all +> you old email around. you seem that type. i delete most of mine. i +will +> be switching to vm on emacs which has folders and stuff and it might +> improve that aspect of things. +> +> uri + +All it costs is disk space, now on someone else's server (because I have +switched from running elm on my Unix box to using Microsoft Exchange, +which the rest of my group uses). + +The letter is below, and I have not re-edited it. But note some things: +the use of 'lexicographic' (as in my most recent suggeston) and the +mention of 'index sorts' which have since received more discussion +between me and John. + +Larry + +> From: Uri Guttman [mailto:uri@sysarch.com] +> Sent: Tuesday, December 22, 1998 21:47 +> To: jdporter@min.net; lr@hpl.hp.com +> Subject: sort article outline +> +> hi guys, +> +> i felt productive tonight and i didn't have other work and news was +slow +> so i wrote an outline for the sort article. i like it so far and i +think +> it will make for a good article (or 2). + +Likely the latter, if it is thorough (as the outline seems to be). + +> so please shred it up and send your comments. when we agree on a +> (semi-) final outline, then we can start fleshing out the paragraphs +> with text and code and benchmarks. +> +> an open area is benchmarking and describing when to use each of the +> speedup tricks. this will need some research and playing with code and +> data. + +'When to use' is what the summary is about. See below. + +> we should have a set of benchmark data we create to drive this so we +can +> all use it and cme up with useful benchmarks. having this stuff on a +> website (maybe MJD could help us) and having others playing with our +> stuff could be an intersting idea. +> +> otherwise, a belated happy hannukah to larry (we threw a sitdown +dinner +> party for 14 on sunday, the eigth night. we had 8 menorahs going!), +and +> happy holidays to you john. + +Nice image (8 menorahs * 9 candles each). And to you both. + +> uri +> +> +> Outline for TPJ Sort Article +> +> (Note the use of caps here :-) +> +> I. Overview of sorting in the real world. + +Refs. Knuth Vol. 3, Chap. 5 +FMTYEWTK + +Introduce O() concept and notation briefly + +> A. What is sorting +> +> B. Why is it needed + +Any program that produces tabular output is almost certain to need it! + +> C. Sort algorithms +> +> 1. bubble + +O(n**2) + +> 2. insertion + +O(n**2) + +> 3. tree + +O(n log n) + +> 4. shell + +O(n log n) + +> 5. quicksort + +O(n log n) + +> 6. others +> +> D. O(n) order function + +Is this later than needed, in order to characterize the sort +algorithms mentioned above? + +> 1. What it means +> +> 2. Constant and linear overhead for pre- and post- +> processing +> +> 3. Constant overhead per compare +> +> 4. Why reducing compare count is critical +> +> 5. Why amount of sort data matters +> +> II. Perl's sort function + +Implementation -- C qsort() with optional callbacks to a Perl function? +Limitations of Perl internal sorting -- size of data set, w/o +sort-merge. +Usefulness of highly tuned system sort command or commercial +sort packages (e.g., SyncSort) + +> A. Builtin compare is cmp + +and implementation is optimal (C or assembly memcmp function) + +> B. Custom compare code + +(in callback function) + +> 1. block of code +> +> 2. sub name +> +> 3. sub type glob +> +> 4. sub ref (to be done) +> +> C. Special vars $a and $b in compare +> +> III Ways to sort in Perl + +Ascending alphabetic (lexicographic) sort (default) + +> A. Numeric sort +> +> B. Reverse sort +> +> C. Multi-level sort + +Are these 'field' sorts? + +> D. Array index sort + +This is a derived-key sort. + +> E. Hash value sort + +This is a derived-key sort. + +> F. Complex compare code + +No single-valued sort key per datum. Beyond the scope of +this article. + +> IV. Efficient sorting in Perl +> +> A. When efficiency is not warranted + ^ an investment in improving + +> 1. Short data sets +> +> 2. Rarely executed program + +More generally, low cost compared to other parts of the +program. Quote from Rob Pike about when to optimize (don't!). + +> B. Sort overhead +> +> 1. builtin vs. {$a cmp $b} + +Trivial sorts (key is trivially derived from datum): +a. {$a <=> $b} or reverse +b. {$hash{$a} cmp $hash{$b}} or numeric; or reverse +c. {$a->[CONST] cmp $b->[CONST]} or numeric; or reverse + +> C. General ideas on how to speed up sort +> +> 1. Preprocessing data +to achieve a trivial sort + +> 2. Simple and fast compare code (builtin if possible) + +Trivial sort. + +> 3. Postprocessing data from LOL (if needed) +> +> D. Speed up techniques +> +> 1. Schwartzian transform (from Randal Schwartz) +> +> a. Converts input to LOL, index sort, convert LOL to output + +index sort? This is Trivial Sort c (see above). + +> b. uses maps or temp arrays +> +> 2. Orcish (Or-cache) manoever (from Joseph Hall) +> +> a. Converts sort data in compare code +> +> b. Converts only once per unique sort data + +but checks every time to see if key derivation is needed. + +> c. Stores converted keys in private hash + +This is Trivial Sort b (see above). + +> d. No need for post processing, as sort is on real data + +Where is the discussion about creating the hash during a +pre-processing pass, obviating the test on each comparand? + +> 3. Stringifying (need better name) (from many people) + +How about 'Linearized Sort'? + +> a. Preprocessing of input data to sortable strings + +'string' == arbitrary sequence of bytes, stored in a Perl +scalar variable (u.e., linear). + +> i. uses either pack or sprintf (or other perl code). +> +> ii. Converted key is combination of binary and ascii + ^^ may be + +> iii. Original data is appended onto converted key + +See below -- cannot be references. + +> b. Uses builtin compare (fastest) +> +> c. Postprocess keys to get original data (use substr +or unpack) + +Note that this is isomorphic to the Schwartz Transform +map/sort/map paradigm. + +> d. Can handle integers (signed or unsigned), ascii +strings, and + ^^^^^ +??? see above +> normal or reverse sorts on a per key basis. +> +> e. Can handle floats (IEEE format only) with code to +deal with big + ^ non-negative, until someone gets smarter! +> vs. little endian. + +Cannot handle references! + + 4. Summary + + a. Examples + IP addresses + Fully-qualified domain names (right to left) + ... + + b. When to use + Criteria + Benchmarks + +> +> -- +> Uri Guttman ----------------- SYStems ARCHitecture and +Software Engineering +> Perl Hacker for Hire ---------------------- Perl, +Internet, UNIX Consulting +> uri@sysarch.com ------------------------------------ +http://www.sysarch.com +The Best Search Engine on the Net ------------- +http://www.northernlight.com + +Good start. Excelsior! + +I am having trouble minding my tongue about Rutka's obnoxious posts, but +he won't disappear (and he now works for Lucent, my alma mater. +Sigh...) + +I will be away from December 25 through January 3; real vacation (no +electronic communication). + +Happy New Year!!! + +Larry + +-- +Larry Rosler +Hewlett-Packard Company +http://www.hpl.hp.com/personal/Larry_Rosler/ +lr@hpl.hp.com + + + +From: Larry Rosler +To: "'Uri Guttman'" +Cc: jdporter@min.net +Subject: RE: sort article outline +Date: Fri, 8 Jan 1999 12:32:49 -0800 +Content-Type: text/plain +Content-Length: 2465 + +> From: Uri Guttman [mailto:uri@sysarch.com] +> Sent: Friday, January 08, 1999 09:59 +> To: lr@hpl.hp.com +> Cc: jdporter@min.net +> Subject: Re: sort article outline +> +... +> LR> make sure to include a note that all this requires that 'no +locale;' be +> LR> in effect, or the idea falls apart. This is from perllocale: +> +> LR> +> LR> no locale; +> LR> print +(sort grep /\w/, map { chr() } 0..255), "\n"; +> +> LR> This machine-native collation (which is what you get unless use +locale +> LR> has appeared earlier in the same block) must be used for sorting +raw +> LR> binary data, whereas the locale-dependent collation of the first +example +> LR> is useful for natural text. +> LR> +> +> LR> Note the 'sorting raw binary data' which is what we are talking +about. +> +> is this due to signed/unsigned cmp issues? that 0 .. 255 might map to +> 128 ..255, 0 .. 127 on some machines because of signed char compares? +> this is another porting issue for packed sorts. not just endianess. + +No. Without looking at the perl sources, I venture to guess that it is +because the 'no locale;' default sort uses 'memcmp' while the 'use +locale;' default sort uses 'strcoll' which relies on LC_COLLATE. There +might be a signedness problem if 'strncmp' were used instead of +'memcmp'. But I recall John saying that 'memcmp' was used (and it would +*have* to be to conform to Perl's definition of a string, which is a +sequence of arbitrary bytes whose length is specified by a separate +integer, vs. C's definition of a string, which is a sequence of bytes +terminated by a zero byte). In any case, I have yet to see any machine +dependencies in my tests on PA-RISC or Intel architectures. Does anyone +know of any target that uses signed-char compares that this could be +verified against -- or just look into the perl source (which I haven't +done yet) to make sure it uses 'memcmp'. + +> printable sorts (hey i like these terms) need to worry about +> locale tho. + +Well, that is interesting. You distinguish between a 'packed' sort key +(which *surely* demands 'no locale;') and a 'printable' ('stringized') +sort key (which perhaps relies on 'use locale;'). Maybe that is a real +distinction that should be understood and documented. A mixed sort key +(packed number and a string, for example) would have to be sorted +without locale, though the number could be stringized by 'sprintf +"%10u"' in which case locale could be observed. + +Larry + + + +From: Larry Rosler +To: "'Uri Guttman'" +Cc: "'jdporter@min.net'" +Subject: RE: sort article outline +Date: Fri, 8 Jan 1999 12:59:24 -0800 +Content-Type: text/plain +Content-Length: 1266 + +... +> No. Without looking at the perl sources, I venture to guess +> that it is because the 'no locale;' default sort uses +> 'memcmp' while the 'use locale;' default sort uses 'strcoll' +> which relies on LC_COLLATE. There might be a signedness +> problem if 'strncmp' were used instead of 'memcmp'. But I +> recall John saying that 'memcmp' was used (and it would +> *have* to be to conform to Perl's definition of a string, +> which is a sequence of arbitrary bytes whose length is +> specified by a separate integer, vs. C's definition of a +> string, which is a sequence of bytes terminated by a zero +> byte). In any case, I have yet to see any machine +> dependencies in my tests on PA-RISC or Intel architectures. +> Does anyone know of any target that uses signed-char compares +> that this could be verified against -- or just look into the +> perl source (which I haven't done yet) to make sure it uses 'memcmp'. + +>From 'sv.c', the only comparison routine used is 'memcmp'. When locale +is in effect, each comparand is passed once through strxfrm and the +transformed value is saved for sort comparisons. (The mechanism must be +like that which converts numbers to strings or v.v. once only.) + +So I think character-signedness is a non-issue. + +Larry + + + +From: Larry Rosler +To: "'John Porter'" , uri@sysarch.com +Subject: RE: sort article outline +Date: Fri, 8 Jan 1999 14:29:13 -0800 +Content-Type: text/plain +Content-Length: 2430 + +> From: John Porter [mailto:jdporter@min.net] +> Sent: Friday, January 08, 1999 11:03 +> To: uri@sysarch.com +> Cc: lr@hpl.hp.com +> Subject: Re: sort article outline +> +> Uri wrote: +> > JP> But it's also long-winded. Can't we simply talk about the +> > JP> "default sort" (locale issues aside)? +> > +> > i don't like that because it doesn't describe what transforms are +being +> > done to get it to use the builtin cmp. just oo vague a term. +> +> Same applies to "default lexicographic sort". It's not really for +> describing any of the transforms we're investigation, but for the +> underlying sort (qsort/cmp) which we're exploiting. +> +> > what is important is the method of transforms that +> > allows you to use the builtin cmp. so i feel the name +> should reflect that. +> +> Right. But everything needs a name, even the "default sort". + +Here, in journalistic format, is a synopsis of where we stand. You may +prefer 'Advanced' to 'Enhanced'. + +WHAT: The Enhanced Default Sort + +WHY: For optimal performance on large sorts (often with several sort + fields), compare sort keys using built-in C functions rather than + using Perl code in a sort subroutine. + +WHERE: Wherever the sort is based on comparing a single-valued mapping + function of each value to be sorted. (This is true for all + enhanced sorts -- ST, Orcish, ...) + +HOW: For each value to be sorted, create a sort key that is a Perl + string (a sequence of bytes) corresponding lexicographically to +the + position of the value in the sorted output; append the value to +the + sort key so that it can readily be retrieved. The sort key can +be + created using any combination of 'pack', 'sprintf' or +concatenation. + Each field to be sorted may comprise a signed or unsigned +integer, + a floating-point number, or a string of characters. (Special + handling is required if the values to be sorted are references -- + LoL or LoH.) + +WHEN: Compute the keys in a single pass through the list of values +(map). + Sort. Retrieve the values from the keys (map). + +Other minor issues: As stated, this method sorts two values whose keys +compare equal according to the values themselves (which is the usual +fall-through condition for a fielded sort). A 'stable' sort can be +forced (using any sorting method) by adding a sequential index as the +last sort key. + +GO NINERS!!! (Bye-bye, Pats :-() + +Larry + + + +i have to write these down now or i may forget them. + +a sort module api could look like this: + +$sort_obj = Sort->new( [ sort keys desriptions ], gloabl sort options + +sort key description => { + + 'type' => integer, string, float, etc. + + 'down' => yes + + 'code_to_get_key_value' => code ref OR code text. + +} + +$sort_obj->values( list of values or refs) + +can be called again to add more to list. + +$sort_obj->sort returns sorted data + + +if the option is to build a preprocess sub, we take the code texts and +create the text for it. then it will run fast, an optimization for large +sorts. otherwise, we use the code ref which takes a ref (and key +number?) for an argument and we call it to return the nth key value. + +from this info we create a pack format string. + +any inversions or endian things are handled here in the preprocess code +(either version) + +for each sort value we build up an array of key values. then we just +do a pack and push its value. + +we also generate an unpack format (not same as pack format but related) +to get the index and/or the original values to return. + +the pack formats, refs to processing code and other stuff is stored in +the sort object for reuse. + +we can even have special case packs for common things like ip addresses +so we can optimize even further there. ip can be designated as text or +binary. + +well, what do you think? + +uri diff --git a/paper/rand_data.pl b/paper/rand_data.pl new file mode 100644 index 0000000..cf5b0cd --- /dev/null +++ b/paper/rand_data.pl @@ -0,0 +1,79 @@ + + +sub rand_picks { + + my( $count, $pattern ) = @_ ; + + my( @rand, @chars, $char, $pick, @digits, @alphas ) ; + + @chars = split( '', $pattern ) ; + + @digits = ( '0' .. '9' ) ; + @alphas = ( 'a' .. 'z', 'A' .. 'Z' ) ; + + while( $count-- ) { + + $pick = '' ; + + foreach $char ( @chars ) { + + if ( $char eq 'a' ) { + + $pick .= $alphas[ rand 52 ] ; + next ; + } + + if ( $char eq '0' ) { + + $pick .= $digits[ rand 10 ] ; + next ; + } + } + + push( @rand, $pick ) ; + } + + @rand ; +} + +sub rand_deal { + + my( $count, @deck ) = @_ ; + + my( @rand ) ; + + push( @rand, $deck[ rand( @deck ) ] ) while $count-- ; + + @rand ; +} + + + +sub rand_ints { + + my( $count, $base, $range ) = @_ ; + + my( @rand ) ; + + push( @rand, int( rand( $range ) + $base ) ) while $count-- ; + + @rand ; +} + + + +sub rand_padded_ints { + + my( $count, $base, $range ) = @_ ; + + my( @rand, $pad ) ; + + $pad = length( $base + $range ) ; + + push( @rand, sprintf( "%0${pad}d", int( rand( $range ) + $base ) ) ) + while $count-- ; + + @rand ; +} + +1 ; diff --git a/paper/sort_bench.pl b/paper/sort_bench.pl new file mode 100644 index 0000000..bede2c0 --- /dev/null +++ b/paper/sort_bench.pl @@ -0,0 +1,72 @@ +#!/usr/local/bin/perl + +use Benchmark; + +@int_10 = rand_padded_ints( 10, 4, 8 ) ; +@int_100 = rand_padded_ints( 100, 4, 8 ) ; + +print "@int_10\n" ; +print "@int_100\n" ; + + +#bench_sorts( 1 << 10, [qw(cmp)], [qw( int_10 int_100 )] ) ; + +bench_sorts( 1 << 10, [qw(cmp default)], [qw( int_100 )] ) ; + +exit( 0 ) ; + +sub bench_sorts { + + my( $count, $sorts_ref, $data_names_ref ) = @_ ; + + my( $sort, $data_name, $bench_text ) ; + + + $bench_text = < sub { null() }, +BENCH + + foreach $sort ( 'null', @{$sorts_ref} ) { + + foreach $data_name ( @{$data_names_ref} ) { + + $bench_text .= < sub { my( \@sorted ) = sort_$sort( \@$data_name ) }, +BENCH + } + } + + $bench_text .= < $b } @_ ; +} diff --git a/paper/sort_gen.pl b/paper/sort_gen.pl new file mode 100644 index 0000000..d141157 --- /dev/null +++ b/paper/sort_gen.pl @@ -0,0 +1,58 @@ +#!/usr/local/bin/perl + + +@sort_hash_desc = ( + + { + 'access' => [ 'Hash', 'foo' ], + 'type' => 'int', + 'width' => 2, + }, + { + 'access' => [ 'HoH', 'key1', 'key2' ], + 'type' => 'int', + 'width' => 3, + }, + { + 'access' => [ 'HoL', 'list', '3' ], + 'type' => 'int', + 'width' => 4, + }, +) ; + +@sort_list_desc = ( + + { + 'access' => [ 'List', 2 ], + 'type' => 'int', + 'width' => 8, + }, + { + 'access' => [ 'LoH', '0', 'key2' ], + 'type' => 'int', + 'width' => 6, + }, + { + 'access' => [ 'LoL', '1', '2' ], + 'type' => 'int', + 'width' => 4, + }, +) ; + +@sort_str_desc = ( + + { + 'access' => [ 'Split', ':' ], + 'slice' => { + 3 => { + 'type' => 'text', + 'width' => 10, + }, + 5 => { + 'type' => 'int', + 'width' => 4, + } + }, + }, +) ; + diff --git a/paper/sort_outline b/paper/sort_outline new file mode 100644 index 0000000..30f55dc --- /dev/null +++ b/paper/sort_outline @@ -0,0 +1,207 @@ + Outline for TPJ Sort Article + +I. Overview of sorting + + A. What is sorting + + B. Why is it needed + + C. Sort theory + + 1. O(n) order function + + 2. Constant and linear overhead for initialization (and post + processing) + + 3. Constant overhead per compare + + 4. Why reducing compare count is critical + + 5. Why amount of sort data matters + + D. Common sort algorithms + + 1. bubble O(n**2) + 2. insertion O(n**2) + 3. tree O(n log n) + 4. shell O(n log n) + 5. quicksort O(n log n) + + E. Other information + + 1. Knuth on sorting (Vol. 3, Chap. 5) + + 2. perlfaq5: "How do I sort an array by (anthing)?" + + 3. Any good textbook on sorting and/or algorithms + +II. Perl's sort function + + A. Internally uses C qsort + + B. Builtin compare is cmp + + 1. C's memcmp for chars + 2. something larry mentioned for locale + + C. Custom compare code + + 1. block of code + + 2. sub name + + 3. sub type glob + + 4. sub ref (to be done) + + C. Special vars $a and $b in compare + +III Basic Ways to sort in Perl + + A. Default ascending lexographic order + + B. Trivial sorts with callback compare code + + 1. Numeric sort + + { $a <=> $b } + + 2. Reverse sort + + { $b cmp $a } OR reverse sort @unsorted (depending on input size) + + 3. Hash value sort + + { $hash{$a} cmp $hash{$b} } + +larry said this about hash value: (please clafiry) +This is a derived-key sort. + + 4. Array index sort + + { $array[$a] cmp $array[$b] } + + C. Basic multi-field sorts + + 1. Multi-field compares + + { $a->[0] cmp $b->[0] || + $a->[1] cmp $b->[1] } + + 2. Multi-field compares with processing (bad) + +larry comments: (please clarify) +No single-valued sort key per datum. Beyond the scope of +this article. + + +IV. Efficient sorting in Perl + + A. When an investment in improving efficiency is not warranted + + 1. Short data sets + + 2. Rarely executed program + + 3. Low cost in CPU time compared to other parts of the program + + B. Sort overhead + + 1. builtin vs. {$a cmp $b} + + 2. complex compare code + + C. General ideas on how to speed up sort by compare reduction + + 1. Reducing sort compare to default or trivial compare + + 2. Pre- and Postprocessing is O(n) vs. complex compare's O(n log n) + + 3. Most these methods are isomorphic as they are O(n) processing + wrapped around O(n log n) sorting + + D. Reduction to trivial sorts + + 1. Schwartzian transform (from Randal Schwartz) + + a. Converts input to LOL (original data/ref is one element of list) + + b. array index sort (trival sort 4) + + c. convert LOL to output {extract saved data/ref) + + d. uses maps or temp arrays + + e. examples and benchmarks (IP addresses) + + 2. Orcish (Or-cache) manoever (from Joseph Hall) + + a. Converts sort data in compare code + + b. Converts only once per unique sort data + + c. Stores converted keys in private hash + + d. hash value sort (trival sort 3) + + e. No need for post processing, as sort is on real data + + f. Can be converted use pre- and post processing (unknown + benchmark) + + g. Requires a single sort key (to hash to converted key) + (single key can be a ref since that will hash uniquely) + + h. examples and benchmarks (IP addresses) + + E. Reduction to default sorts + + 1. Most efficient since no perl code is called during compare + + 2. Can sort multiple keys, string or integers, IEEE floats, + ascending or descending (per key). + + 3. Preprocessing of input data to sortable strings + + a. uses pack or sprintf (or other perl code line . and join) to + create sort key (in a map or foreach/push) + + b. Converted key can be a combination of binary and ascii + + c. Original data is appended onto created sort key + + 4. Extraction of original data + + d. Use substr or unpack (in a map or foreach/push) + + 5. Can't sort refs directly, but an array index sort works around + that: + + Store the refs in an array. store the array index for each ref + as the original data in the string. + + 6. Printed sort + + a. primarily uses sprintf to create printable keys + + b. fully portable as is + + c. need to blank/zero pad integers to keep created keys same size + + d. reverse sort order via ??? (larry, clarify your + idea here) (also can floats be reversed?) + + e. examples and benchmarks (IP addresses) + + 7. Packed sort + + a. primarily uses pack to create binary keys + + b. needs to handle porting issues (endian, IEEE floats) + + c. no padding needed for binary key parts + + d. reverse sort order via ??? (larry, clarify your + idea here) (also can floats be reversed?) + + e. examples and benchmarks (IP addresses) diff --git a/paper/sort_paper.html b/paper/sort_paper.html new file mode 100644 index 0000000..7837ff0 --- /dev/null +++ b/paper/sort_paper.html @@ -0,0 +1,875 @@ + + + + +A Fresh Look at Efficient Perl Sorting + + + +

A Fresh Look at Efficient Perl Sorting

+

Uri Guttman and Larry Rosler

+

Uri Guttman is an independent Perl and Internet consultant; uri@sysarch.com
+Larry Rosler is at Hewlett-Packard Laboratories, Palo Alto, CA; lr@hpl.hp.com

+

Abstract

+

Sorting can be a major bottleneck in Perl programs. Performance +can vary by orders of magnitude, depending on how the sort is +written. In this paper, we examine Perl´s sort function in +depth and describe how to use it with simple and complex data. Next we +analyze and compare several well-known Perl sorting optimizations +(including the Orcish Maneuver and the Schwartzian Transform). We then +show how to improve their performance significantly, by packing multiple +sortkeys into a single string. Finally, we present a fresh approach, +using the sort function with packed sortkeys and without a +sortsub. This provides much better performance than any of the other +methods, and is easy to implement directly or by using a new module we +created, Sort::Records.

+ +

NOTE: Sort::Records died during development but five years later, +Sort::Maker was released and does all that was promised and more. Find +it on CPAN +

+ +

What is sorting and why do we use it?

+

Sorting is the rearrangement of a list into an order defined by a monotonically increasing or decreasing sequence of sortkeys, where each sortkey is a single-valued function of the corresponding element of the list. (We will use the term sortkeys to avoid confusion with the keys of a hash.)

+

Sorting is used to reorder a list into a sequence suitable for further processing or searching. In many cases the sorted output is intended for people to read; sorting makes it much easier to understand the data and to find a desired datum.

+

Sorting is used in many types of programs and on all kinds of data. It is such a common, resource-consuming operation that sorting algorithms and the creation of optimal implementations comprise an important branch of computer science.

+

This paper is about creating optimal sorts using Perl. We start with a brief overview of sorting, including basic algorithm theory and notation, some well-known sorting algorithms and their efficiencies, sortkey processing, and sorting outside of Perl. Next we will describe Perl´s sort function [1] and basic ways to use it. Then we cover handling complex sortkeys, which raises the question of how to optimize their processing. Finally we introduce a relatively new method, which moves all the sortkey processing out of the sort function, and which produces the most efficient Perl sort. A new module is also described, which implements this sorting technique and which has powerful support for sortkey extraction (the processing of the input data to produce the sortkeys.

+

Algorithm and sorting theory

+

A complete discussion of algorithm and sorting theory is beyond the scope of this paper. This section will cover just enough theory and terminology to explain the methods that we use to compare sort techniques.

+

The complexity of an algorithm is a measure of the resources needed to execute the algorithm -- typically there is a critical operation that needs to be executed many times. Part of algorithm theory is figuring out which operation is the limiting factor, and then formulating a function that describes the number of times the operation is executed. This complexity function is commonly written with the big-O notation -- O(f(N)) -- where `O´ is read as `order of´ and `f(N)´ is some function of N, the size of the input data set.

+

O(f(N)) comparisons have some unusual properties. The actual size of N is usually irrelevant to the correct execution of an algorithm, but its influence on the behavior of f(N) is critical. If an algorithm´s order is O(N*logN + N), when N is large enough the effect of the N on the function´s value is negligible compared to the N*logN expression. So that algorithm´s order is just O(N*logN). In many cases the calculated order function for an algorithm is a polynomial of N, but you see only the term with the highest power, and no coefficient is shown. Similarly, if two algorithms have the same order but one does more work for each operation, they are still equivalent in order space, even though there may be a substantial difference in real-world speeds. That last point is crucial in the techniques we will show to optimize Perl sorts, all of which have the same big-O function, O(N*logN).

+

Here are some well-known algorithms and their order functions (adapted from [2]):

+

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Notation

+

Name

+

Example

+

O(1)

+

constant

+

array or hash index

+

O(logN)

+

logarithmic

+

binary search

+

O(N)

+

linear

+

string comparison

+

O(N*logN)

+

n log n

+

advanced sort

+

O(N**2)

+

quadratic

+

simple sort

+

O(N**3)

+

cubic

+

matrix multiplication

+

O(2**N)

+

exponential

+

set partitioning

+

+ + +

Sorting´s critical operation is determining in which order to put pairs of elements of the data. The comparison can be as simple as finding whether two numbers are equal or which is greater than the other (or doing similar operations on strings), or it can be quite complex.

+

Simple sorting algorithms (bubble or insertion sorts) compare each element to each of the others repeatedly, so their complexity is O(N**N). Even with the triangle optimization ($x is equal to $x, and $x compared to $y is the negative of $y compared to $x), which reduces the function to O((N * (N-1))/2), the complexity is still O(N**N), as explained above.

+

But these algorithms have their uses. When N is very small, they can actually be faster than the other methods, because the O(1) and O(N) overhead of the advanced sorts may outweigh the O(N**2) behavior of the simple sorts. "Fancy algorithms are slow when N is small, and N is usually small. Fancy algorithms have big constants." [3] The really important cases, which are worth care in the coding, occur when N is large.

+

Advanced sorting methods repeatedly partition the records to be sorted into smaller sets, to reduce the total number of comparisons needed. Their complexity is O(N*logN), which can be much less than O(N**2) for sufficiently large values of N. These algorithms include `tree sort´, `shell sort´, and `quicksort´. [4]

+

Some specialized sort algorithms (such as `radix sort´) work by comparing pieces of numeric sortkeys, and can achieve linear complexity (O(N)) [5]. These methods are not general-purpose, so we will not address them further.

+

One property of sort algorithms is whether they are stable. A stable sort preserves the order in the sorted data of two elements that compare equal. Some sorting problems require stability. The simple sorting algorithms are generally stable; the advanced ones are not. We will show how to make Perl´s advanced sort behave stably if required.

+

An important sorting variation is when the original data elements can´t conveniently be moved around by the sort algorithm´s shuffling. So instead of sorting the elements directly, you sort their index numbers. You then use the sorted indexes to create a list of sorted elements. Some sort operators in other languages (APL comes to mind) simply return sorted indexes, and it is up to the programmer to use them correctly. We will show how to create an efficient Perl index sort and where it is useful.

+

Sortkeys

+

If you are sorting a set of scalar-valued elements where the comparison looks at the entire element, the sortkey is simply the entire element. More generally, the sortkey is based on some properties that are functions of all or part of the element. Such subkeys may be extracted from internal properties of parts of the element (fields) or derived from external properties of the element (such as the modification date of a file named by the element, which is quite expensive to retrieve from the file system).

+

To avoid repeated computation of the sortkeys, the sort process has to retain the association between records and their extracted or derived sortkeys. Sorting theory and algorithms usually ignore the cost of this association, as it is typically a constant factor of the comparison operation. But as we will see later, in the real world, removing that overhead or reducing it from O(N*logN) to O(N) is very valuable, especially as N grows.

+

Complex sortkeys can add tremendously to the overhead of each comparison. This occurs where the records have to be sorted by primary, secondary, and lower-order subkeys. This is also known as doing a subsort on the lower keys. Extracting and comparing complex sortkeys can be costly and error-prone.

+

No general-purpose implementation of a sort algorithm can efficiently support extracting and comparing different types of sortkeys. Therefore, most sort implementations provide a simple interface to call a sortsub -- a custom comparison subroutine which is passed two operands. These operands can be the records themselves, or references to or indexes of complex records. The comparison returns a negative, zero, or positive value, depending on the ordering of the sortkeys of the two records. The programmer is responsible for any preprocessing of the records to generate the sortkeys and any postprocessing to retrieve the sorted data. The generic sort function only manages the comparisons and shuffles the operands into sorted order.

+

As Perl´s sort function is O(N*logN), efficiency must come from extracting and comparing the sortkeys using the least amount of work. Much of this paper will be about methods to make sortkey extraction and comparison as efficient as possible.

+

External sorting

+

Every popular commercial operating system offers a sort utility. Unix/POSIX flavors typically have a sort command which is fast and fairly flexible with regard to sortkey extraction from text files. In some cases, the Unix/POSIX sort command may be easier to code and more efficient than using the Perl sort function.

+

Several vendors sell highly optimized commercial sort packages that have received decades of attention and can handle massive amounts of data. But they are very expensive and not suitable for use inside a Perl program.

+

All of these are capable of dealing efficiently with very large amounts of data, using external media such as disk or tape files for intermediate storage when needed. In contrast, the Perl sort function requires that the entire list of operands be in (real or -- much more expensively -- virtual) memory at the same time. So Perl is not the appropriate tool to use for huge sorts (where huge is defined by your system´s memory limits), which we shall not consider further.

+

Internal sorting

+

The Perl sort function uses an implementation of the quicksort algorithm that is similar to (but more robust than) the qsort function in the ANSI/ISO Standard C Library [6]. In the simplest use, the Perl sort function requires no sortsub:

+
@out = sort @in;
+

This default sorts the data in ascending lexicographic order, using the fast C memcmp function as the comparison operation. If a locale is specified, it substitutes the more complicated and somewhat slower C strcoll function.

+

If you want any kind of ordering other than this, you must provide a custom comparison sortsub. The sortsub can be specified either as a code block, the name of a subroutine, or a typeglob that refers to a subroutine (a coderef). In Perl 5.6, a scalar variable that contains a coderef can also be used to specify the sortsub.

+

In order to optimize the calling of the sortsub, Perl bypasses the usual passing of arguments via @_, using instead a more efficient special-purpose method. Within the sortsub, the special global package variables $a and $b are aliases for the two operands being compared. The sortsub must return a number less than 0, equal to 0, or greater than 0, depending on the result of comparing the sortkeys of $a and $b. The special variables $a and $b should never be used to change the values of any input data, as this may break the sort algorithm.

+

Even the simplest custom sort in Perl will be less efficient than using the default comparison. The default sort runs entirely in C code in the perl core, but any sortsub must execute Perl code. A well-known optimization is to minimize the amount of Perl code executing and to try to stay inside the perl core as much as possible. Later we will see various optimization techniques that will reduce the amount of Perl code executed.

+

The primary goal of this paper is to perform all sorts using the default comparison. Here is how an explicit ascending lexicographic would be done using a sortsub:

+
@out = sort { $a cmp $b } @in;
+

For a simple measurement, compare Default and Explicit in Benchmark A1 of Appendix A. The default method is about twice as fast as the explicit method.

+

Trivial sorts

+

We call trivial sorts those that use the entire record as the sortkey and do only a minimal amount of processing of the record. To do trivial Perl sorts other than ascending lexicographic, you just need to create an appropriate sortsub. Here are some common ones that perform useful functions.

+

The simplest such example is the ascending numeric sort, which uses the picturesquely monikered `spaceship´ operator:

+
@out = sort { $a <=> $b } @in;
+

A numeric sort capability is required because the lexicographic order of, say, (1, 2, 10) does not correspond to the numeric order.

+

If you want the sort to be in descending order there are three techniques you can use. The worst is to negate the result of the comparison in the sortsub. Better is to reverse the order of the comparison by swapping $a and $b. This has the same speed as the corresponding forward sort.

+
# descending numeric
+@out = sort { $b <=> $a } @in; +# descending lexicographic
+@out = sort { $b cmp $a } @in; +

The best method is to apply the reverse function to the result of a default ascending lexicographic sort.

+
@out = reverse sort @in;
+

Note that this is faster than using the explicit descending lexicographic sort, for the reason discussed above: the default sort is faster than using a sortsub. The reverse function is efficient because it just moves pointers around.

+

Another common problem is sorting with case insensitivity. This is easily solved using the lc or uc function. Either one will give the same results.

+
@out = sort { lc $a cmp lc $b } @in;
+

Benchmark A1 analyzes these examples as a function of the input size. The O(N*logN) behavior is apparent, as well as the cost of using even a simple built-in function like lc in the sortsub.

+

Fielded and record sorts

+

The above trivial sorts sort the input list using as the sortkey the entire string (for a lexicographic sort) or the first number in each datum (for a numeric sort). More typically, the sortkey is based on some property that is a function of all or part of each datum. Several individual subkeys may be combined into a single sortkey or may be compared in pairs individually.

+

A complex string may be divided into fields, some of which may serve as subkeys. For example, the Unix/POSIX sort command provides built-in support for collation based on one or more fields of the input; the Perl sort function does not, and the programmer must provide it. One CPAN module focuses on fielded sorts [7].

+

If your data are records which are complex strings or references to arrays or hashes, you have to perform comparisons on selected parts of the records. This is called record sorting. (Fielded sorts are a subset of record sorts.)

+

In the code examples that follow, KEY() is meant to be substituted with some Perl code that performs sortkey extraction. It is best that it not be an actual subroutine call, because subroutine calls within sortsubs can be expensive. Calls to built-in Perl functions (such as the calls to lc in the example above) are like Perl operators, thus relatively less expensive.

+

When sorting string records, $a and $b are set to those strings, so to extract the sortkeys you generally perform various string operations on the records. Functions commonly used for this include split, substr, unpack, and m//. Here is one example, sorting a list of password-file lines by user name using split. The fields are separated by colons, and the user name is the first field.

+
@out = sort {
+ (split ':', $a, 2)[0] cmp
+ (split ':', $b, 2)[0]
+ } @pw_lines; +

Multi-subkey sorts

+

In some cases you need to sort records by a primary subkey, then for all the records with the same primary subkey value, you need to sort by a secondary subkey. One horribly inefficient way to do this is to sort first by the primary subkey, then get all the records with a given subkey and sort them by the secondary subkey. The standard method is to do a multi-key sort. This entails extracting a subkey for each field, and comparing paired subkeys in priority order. So if two records with the same primary subkey are compared, they will actually be compared based on the secondary subkey. Sorting on more than two subkeys is done by extending the logic.

+

Perl has a very nice feature which makes multi-key sorts easy to write. The || (short-circuit or) operator returns the actual value of the first logically true operand it sees. So if you use || to concatenate a set of key comparisons, the first comparison is the primary subkey. If a pair of primary subkeys compare equal, the sortsub´s return value will be the result of the secondary subkey comparison.

+

An example will illustrate this `ladder´ of comparisons better than more text. Here is a three-subkey sort:

+
@out = sort {
+ # primary subkeys comparison
+ KEY1($a) cmp KEY1($b)
+ ||
+ # or if they are equal
+ # return secondary comparison
+ # descending numeric comparison
+ KEY2($b) <=> KEY2($a)
+ ||
+ # or if they are equal
+ # return tertiary comparison
+ # lexicographic comparison
+ KEY3($a) cmp KEY3($b)
+} @in; +

Naive multi-subkey record sorts

+

In the two previous examples, we showed a sort with relatively expensive sortkey extraction (via split), and a multi-subkey sort. Let´s combine them. For concreteness, we shall deal with a problem that has received much attention in comp.lang.perl.misc -- sorting a list of IP addresses in `dotted-quad´ form. Each element of the list is a string of the form "nnn.nnn.nnn.nnn\tabc.xyz.com\n", where nnn represents a decimal integer between 0 and 255, with or without leading zero-padding.

+

In the most naive approach, we sort on each of these four numeric fields as individual subkeys, in succession.

+
@out = sort {
+ my @a = $a =~
+ /(\d+)\.(\d+)\.(\d+)\.(\d+)/;
+ my @b = $b =~
+ /(\d+)\.(\d+)\.(\d+)\.(\d+)/;
+ $a[0] <=> $b[0] ||
+ $a[1] <=> $b[1] ||
+ $a[2] <=> $b[2] ||
+ $a[3] <=> $b[3]
+ } @in; +

Even for small lists this is very slow, because of the many Perl operations executed in the sortsub for every one of the O(N*logN) comparisons.

+

Computing a single packed-string sortkey

+

To improve performance, we will derive from these four subkeys a single packed-string sortkey for each IP address, which we can then use to sort the array monotonically increasing.

+

The following expression produces the shortest key, a string of four bytes, with the least Perl calculation:

+
pack 'C4' => $string =~
+ /(\d+)\.(\d+)\.(\d+)\.(\d+)/ +

This uses the fancy comma operator, =>, which you can read as `applied to´. We then sort these sortkeys lexicographically.

+

The following, then, is the next approach toward achieving an efficient sort:

+
@out = sort {
+ pack('C4' => $a =~
+ /(\d+)\.(\d+)\.(\d+)\.(\d+)/)
+ cmp
+ pack('C4' => $b =~
+ /(\d+)\.(\d+)\.(\d+)\.(\d+)/)
+ } @in; +

Benchmark A2 shows that comparing the subkeys in pairs is less efficient than packing them and comparing the packed strings. This observation applies to all sorting methods. In further benchmarks of advanced sorts for this problem, we will always used packed sortkeys.

+

Nevertheless, naive sorting is still woefully inefficient, because both sortkeys are recomputed every time one input operand is compared against another. What we need now is a way to compute each sortkey once only and to remember the result.

+

Advanced sorts

+

As all sorts in Perl use the builtin sort function and therefore the same quicksort algorithm, all Perl sorts are of order O(N*logN). We can´t improve upon that, so we have to address other issues to gain efficiency. As the complexity is fixed, tackling the constant factors can be fruitful and, in the real world, can produce significant improvements in efficiency. When a sortsub needs to generate a complex sortkey, that is normally done O(N*LogN) times, but there are only N records, hence N sortkeys. What if we were to extract the sortkey only once per record, and keep track of which sortkey belonged to which record?

+

Caching the sortkeys

+

The obvious way to associate sortkeys with the records from which they were created is to use a hash. The hash can be created in a preprocessing pass over the data. If the approximate size of the data set is known, preallocating the hash improves performance.

+
keys my %cache = @in;
+$cache{$_} = KEY($_) for @in; +

The following sets up the cache more efficiently, using a hash slice:

+
keys my %cache = @in;
+@cache{@in} = map KEY($_) => @in; +

Then the sortsub simply sorts by the values of the cached sortkeys.

+
@out = sort {
+ $cache{$a} cmp $cache{$b)
+ } @in; +

In essence, we have replaced lengthy computations in the sortsub by speedy (O(1)) hash lookups.

+

If you want to do a complex multi-key comparison, you either have to use a separate cache for each subkey or combine subkeys in a similar way to the packed-sort optimizations we will describe later. Here is an example of the former:

+
keys my %cache1 = @in;
+keys my %cache2 = @in;
+($cache1{$_}, $cache2{$_}) =
+ map { KEY1($_), KEY2($_) } $_
+ for @in;
+@out = sort {
+ $cache1{$a} cmp $cache1{$b) ||
+ $cache2{$b} <=> $cache2{$a} }
+ @in; +

Alternatively, a multi-level cache can be used, which sacrifices speed to save some space:

+
keys my %cache = @in;
+$cache{@in} =
+ map [ KEY0($_), KEY1($_) ]
+ => @in;
+@out = sort {
+ $cache{$a}[0] cmp $cache{$b)[0]
+ ||
+ $cache{$b}[1] <=> $cache{$a}[1]
+ } @in; +

An important point about cached sorts is that no postprocessing is needed to retrieve the sorted records. The method sorts the actual records, but uses the cache to reduce the sortkey extraction to O(N).

+

The Orcish Maneuver (OM)

+

The Orcish Maneuver (invented by Joseph N. Hall [8]) eliminates the preprocessing pass over the data, which might save keeping a copy of the data if they are being read directly from a file. It does the sortkey extraction only once per record, as it checks the hash to see if it was done before. The test and storage of the sortkey is done with the ||= operator (short-circuit or-assignment), which will evaluate and assign the expression on the right to the lvalue on the left, if the lvalue is false. The name `orcish´ is a pun on `or-cache´. The full statement in the sortsub looks like this:

+
keys my %or_cache = @in;
+@out = sort {
+ ($or_cache{$a} ||= KEY($a))
+ cmp
+ ($or_cache{$b} ||= KEY($b))
+ } @in; +

That sees if the sortkey for $a is cached, and if not, extracts it and caches it. The sortkey for $a is then compared to the sortkey for $b (which is found in the same way).

+

Here is an example of a two-subkey comparison using two caches:

+
keys my %or_cache1 = @in;
+keys my %or_cache2 = @in;
+@out = sort {
+ ($or_cache1{$a} ||= KEY1($a))
+ cmp
+ ($or_cache1{$b} ||= KEY1($b))
+ ||
+ ($or_cache2{$b} ||= KEY2($b))
+ <=>
+ ($or_cache2{$a} ||= KEY2($a))
+ } @in; +

The OM has some minor efficiency flaws. An extra test is necessary after each sortkey is retrieved from the or-cache. Furthermore, if an extracted sortkey has a false value, it will be recomputed every time. This usually works out all right, because the extracted sortkeys are seldom false. However, except when the need to avoid reading the data twice is critical, the explicit cached sort is always slightly faster than the OM. (See Benchmark A3.)

+

The Schwartzian Transform (ST)

+

A more efficient approach to caching sortkeys, without using named temporary variables, was popularized by Randal L. Schwartz, and dubbed the Schwartzian Transform [9, 10]. (It should really be called the Schwartz Transform, after the model of the Fourier and Laplace Transforms, but it is too late to fix the name now.)

+

The significant invention in the ST is the use of anonymous arrays to store the records and their sortkeys. The sortkeys are extracted once, during a preprocessing pass over all the data in the list to be sorted (just as we did before in computing the cache of sortkeys).

+
@out =
+ map $_->[0] =>
+ sort { $a->[1] cmp $b->[1] }
+ map [ $_, KEY($_) ] =>
+ @in; +

The ST doesn´t sort the actual input data. It sorts the references to anonymous arrays that contain the original records and the sortkeys. So we have to postprocess to retrieve the sorted records from the anonymous arrays.

+

Using the ST for a multi-subkey sort is straightforward. Just store each successive extracted subkey in the next entry in the anonymous array. In the sortsub, do an or between comparisons of successive subkeys, as with the OM and the naive sorts.

+
@out =
+ map $_->[0] =>
+ sort { $a->[1] cmp $b->[1] ||
+ $b->[2] <=> $a->[2] }
+ map [ $_, KEY1($_), KEY2($_) ]
+ => @in; +

For a very illuminating deconstruction and reconstruction of the ST, see [11].

+

The packed-default sort

+

Each of the advanced sorting techniques described above saves the operands to be sorted together with their sortkeys. (In the cached sorts, the operands are the keys of a hash and the sortkeys are the values of the hash; in the Schwartzian Transform, the operands are the first elements of anonymous arrays, the sortkeys are the other elements of the arrays.) We now extend that idea to saving the operands to be sorted together with packed-string sortkeys, using concatenation.

+

This little-known optimization improves on the ST by eliminating the sortsub itself, relying on the default lexicographic sort, which as we showed earlier is very efficient. This is the method used in the new Sort::Maker module.

+

To accomplish this goal, we modify the ST by replacing its anonymous arrays by packed strings. First we pack into a single string each subkey followed last by the operand to be sorted. Then we sort lexicographically on those strings, and finally we retrieve the operands from the end of the strings.

+
@out =
+ map substr($_, 4) =>
+ sort
+ map pack('C4' =>
+ /(\d+)\.(\d+)\.(\d+)\.(\d+)/)
+ . $_ => @in; +

Several methods can be used, singly or in combination, to construct the packed strings, including concatenation, pack, or sprintf. Several methods can be used to retrieve the operands, including substr (shown here), which is likely to be the fastest, split, unpack or a regex.

+

Multiple subkeys are simply concatenated, suitably delimited if necessary. Techniques for computing subkeys of various types are presented in Appendix B.

+

Benchmarks of the packed-default sort

+

Benchmark A4 compares the two most advanced general-purpose sorting techniques, ST and packed-default. These multi-stage sorts are measured both as individual stages with saved intermediate data and as single statements.

+

The packed-default sort is about twice as fast as the ST, which is the fastest familiar Perl sorting algorithm.

+

Earlier, we showed a trivial sort using the lc function. Even for that case, the packed-default sort provides better performance when more than a few data items are being sorted. See Benchmark A5, which shows quasi-O(N) behavior for the packed-default sort (because the sorting time is small relative to the sortkey extraction).

+

Sorting a list of arrays or hashes

+

Consider the common problem of sorting a two-dimensional data structure, a list of references to arrays or to hashes, where the sortkeys are functions of the values of the submembers.

+

If we were to use the packed-default method, the references would be converted to strings and appended to the sortkeys. After sorting, the operands could be retrieved as strings, but would no longer be usable as references. Instead, we must use the indexes of the list members as the operands to be sorted.

+

The following benchmark compares a packed-sortkey ST sort with an indexed sort that uses the packed-default approach. The list being sorted comprises references to arrays, each of which has two elements: an IP address (which serves as the primary sortkey), and a domain name (which serves as the secondary sortkey). These are the same data as used in the above benchmarks, split into two array elements.

+
@out =
+ map $_->[0] =>
+ sort { $a->[1] cmp $b->[1] }
+ map [ $_, pack('C4 A*' =>
+ $_->[0] =~
+ /(\d+)\.(\d+)\.(\d+)\.(\d+)/,
+ $_->[1]) ] => @in;
+
+my $i = 0;
+keys my %h = @in;
+@h{ map pack('C4 A* x N' => $_->[0]
+ =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/,
+ $_->[1], $i++) => @in } = @in;
+@out = @h{ sort keys %h }; +

The indexed sort is faster than the ST once again. (See Benchmark A6.)

+

Indexed sorts and stable sorts

+

In the indexed sort, the auto-incrementing index $i ensures that no array records will have identical packed sortkeys. It also ensures that the sort will be stable.

+

Any Perl sort can be stabilized by using such an index as the final tie-breaking subkey. For an indexed sort, the index is actually the operand being sorted. This fact offers another possible performance advantage for the indexed sort. The actual records to be sorted (which may be long strings) need not be appended to the sortkeys, which would create a second copy of each record. Using the indexed sort, the records may be recovered after the sort from the original data, using the sorted indexes.

+

The Sort::Maker module

+

Sort::Maker is on CPAN and implements the GRT for +all types of Perl values.

+

+

Conclusions

+

Packing of subkeys into strings that can be compared lexicographically improves the performance of all sorting techniques, relative to the usual method of comparing the individual subkeys in pairs.

+

Packing the operands with the sortkeys allows the sort to be done using the default ascending lexicographic comparison (without a sortsub). This yields a markedly faster sort than the Orcish Maneuver or the Schwartzian Transform. The sorting process may approximate O(N) behavior, because the O(N*logN) time for the sort itself is small compared to the time required to extract the sortkeys.

+

The packed-sortkey sort may be written explicitly, or the new Sort::Maker module may be used.

+

Acknowledgments

+

This idea was brought to our attention by Michal Rutka [12]. John Porter participated in initiating this project and reviewed a draft of the paper.

+

References

+ +

1. The sort function man page, http://www.perl.com/CPAN/doc/manual/html/pod/perlfunc/sort.html

+

2. Kernighan, B. W. & Pike, R., (1999). The Practice of Programming, p. 41. Addison-Wesley.

+

3. Pike, R. (1989). Notes on Programming in C, http://wwwwbs.cs.tu-berlin.de/~jutta/c/pikestyle.html

+

4. Knuth, D. E. (1998). The Art of Computer Programming : Sorting and Searching (Vol 3, 2nd ed), chap. 5. Addison-Wesley.

+

5. Sedgewick, R. (1983). Algorithms, chap. 10. Addison-Wesley.

+

6. ANSI/ISO 9899-1992, sect. 4.10.5.2. American National Standards Institute.

+

7. Hall, J. N., Sort::Fields -- Sort lines containing delimited fields, http://www.perl.com/CPAN/modules/by-module/Sort/JNH/

+

8. Hall, J. N. (1998). Effective Perl Programming, p. 48. Addison-Wesley.

+

9. How do I sort an array by (anything)?, http://www.perl.com/CPAN/doc/manual/html/pod/perlfaq4.html#How_do_I_sort_an_array_by_anyth

+

10. Christiansen, T. & Torkington, N. (1998). The Perl Cookbook, Recipe 4.15: "Sorting a List by Computable Field". O´Reilly.

+

11. Christiansen, T., Far More Than Everything You´ve Ever Wanted to Know About Sorting, http://www.perl.com/CPAN/doc/FMTEYEWTK/sort.html

+

12. Rutka, M., in comp.lang.perl.misc. http://x4.dejanews.com/[ST_rn=ps]/getdoc.xp?AN=397853353)

+ +

Appendix A: Benchmarks

+

A caveat: Useful benchmarking depends on judicious isolation of relevant variables, both in the algorithms being benchmarked and in the data sets used. Different implementations may give different relative results even with the same algorithms and data. Thus all such results should be verified under your own conditions. In short, your mileage may vary.

+

In the following benchmarks, all data represent the time (in microseconds) per line in the input data, which averages 35 characters per line. All named arrays and hashes are preallocated, which reduces the variance in the measurements due to storage allocation.

+

Benchmark A1. Trivial sorts

+

+ + + + + + + + + + + + + + + +
+

Control

+
@out = @in;
+

Default

+
@out = sort @in;
+

Reverse

+
@out = reverse sort @in;
+

Explicit

+
@out = sort
+ { $a cmp $b } @in;
+

Insensitive

+
@out = sort
+ { lc $a cmp lc $b }
+ @in;
+

+ + +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Number of lines:

+

100

+

1000

+

10K

+

100K

+

Control

+

5

+

6

+

7

+

8

+

Default

+

9

+

13

+

19

+

25

+

Reverse

+

9

+

14

+

19

+

26

+

Explicit

+

17

+

25

+

37

+

50

+

Insensitive

+

47

+

62

+

91

+

119

+

+ +

Benchmark A2. Naive sorts (IP addresses)

+

+ + + + + + + + + + + + + + + + + + +
+

Number of lines:

+

100

+

1000

+

10K

+

100K

+

Separate subkeys

+

697

+

1251

+

1732

+

2758

+

Packed sortkeys

+

583

+

1002

+

1363

+

1814

+

+ +

Benchmark A3. Cached sorts (packed sortkeys)

+

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Number of lines:

+

100

+

1000

+

10K

+

100K

+

Caching

+

66

+

75

+

85

+

74

+

Sorting

+

49

+

87

+

122

+

164

+

Total cached sort

+

116

+

163

+

215

+

240

+

Orcish Maneuver

+

125

+

168

+

221

+

256

+

+ +

Benchmark A4. Advanced packed-key sorts

+

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Number of lines:

+

100

+

1000

+

10K

+

100K

+

ST

+

 

+

 

+

 

+

 

+

Anon arrays

+

80

+

84

+

84

+

75

+

Sorting

+

27

+

47

+

76

+

97

+

Retrieval

+

13

+

18

+

20

+

17

+

One statement

+

116

+

150

+

177

+

191

+

Packed Default

+

 

+

 

+

 

+

 

+

Packing

+

61

+

63

+

65

+

67

+

Sorting

+

9

+

12

+

18

+

25

+

Retrieval

+

12

+

12

+

13

+

12

+

One statement

+

73

+

79

+

86

+

93

+

+ +

Benchmark A5. Another look at a trivial sort

+

+ + + + + + +
+

Insensitive

+
@out = sort
+ { lc $a cmp lc $b }
+ @in;
+

Packed

+
@out = map substr($_,
+ 1 + rindex $_, "\0")=>
+ sort => map "\L$_\E\0$_"
+ => @in;
+

+ + +

+ + + + + + + + + + + + + + + + + + + + + +
+

Number of lines:

+

10

+

100

+

1000

+

10K

+

100K

+

Insensitive

+

19

+

38

+

62

+

91

+

118

+

Packed

+

22

+

22

+

24

+

25

+

27

+

+ +

Benchmark A6. Two-dimensional packed-sortkey sorts

+

+ + + + + + + + + + + + + + + + + + +
+

Number of lines:

+

100

+

1000

+

10K

+

100K

+

ST

+

243

+

314

+

359

+

435

+

Index

+

200

+

285

+

323

+

259

+

+ +

Appendix B: Explicit packed-default sorts

+

B1. Creating and combining sortable string subkeys

+

This is the preprocessing pass (the first map executed).

+
@sorted = map ... => sort =>
+ map KEY($_) . $_ => @data; +

To create and combine the subkeys and the operand to be sorted, any combination of concatenation, interpolation, pack, or sprintf may be used, the latter two with simple or compound formats.

+

Fixed-length strings (ascending):

+ +

simple interpolation

+
pack('... An ...', ...)
+sprintf('... %s ...', ...)
+ +

Fixed-length strings (descending):

+ +

Bit-complement the string first.

+
$subkey = $string ^
+    "\xFF" x length $string
+

Then handle as an ascending fixed-length string.

+ +

Null bytes ("\0") are used to terminate string subkeys of varying length, as that ensures lexicographic ordering. If a string subkey may contain a null byte, then it must be of fixed length. If any of the operands to be sorted may contain null bytes, then every subkey must have fixed length.

+

Varying-length strings (ascending):

+

Terminate the string with a null byte, to separate it from succeeding subkeys or the operand.

+ +

interpolation: "$string\0"

+

pack('... A* x ...', ...)

+

sprintf('... %s\0 ...', ...)

+ +

Varying-length strings (descending):

+

Make a prepass over the data to find the length of the longest string.

+ +
my $len = 0;
+$len < length and $len = length
+ for map KEY($_) => @data;
+ +

Then null-pad each string to that length and proceed as above for fixed-length strings (descending).

+ +
$subkey = pack("a$len", $string)
+ ^ "\xFF" x $len
+ +

Unsigned 32-bit integers (ascending):

+

Pack or zero-pad to fixed length.

+ +

Preferred -- only 4 bytes:

+
pack('... N ...', ...)
+

Readable -- but 10 bytes:

+
sprintf('... %.10u ...', ...)
+ +

Signed two´s-complement 32-bit integers (ascending):

+

Bias to unsigned by xoring the sign bit, then treat as unsigned.

+ +
$subkey = pack('N',
+ $number ^ (1 << 31));
+ +

Floating-point numbers (ascending):

+

This code assumes that floating-point numbers are represented in binary using IEEE format. Create a subroutine that packs a double in network order (big-endian).

+ +
BEGIN {
+ my $big_endian =
+ pack('N', 1) eq
+ pack('L', 1);
+ sub float_sort ($) {
+ ($big_endian ?
+ pack 'd', $_[0] :
+ reverse pack 'd', $_[0]) ^
+ ($_[0] < 0 ? "\xFF" x 8 :
+ "\x80" . "\x00" x 7)
+ }
+$subkey = float_sort($number);
+ +

Descending integers or floating-point numbers:

+

Negate the value, then use the appropriate one of the above.

+

B2. Extracting the operands from the sorted strings

+

This is the postprocessing pass (the second map executed).

+
@sorted = map RETRIEVE($_) =>
+ sort => map ... => @data; +

If all the subkeys have known length, use the total length:

+ +

Preferred for efficiency:

+
@sorted =
+ map substr($_, $length) =>
+ ... +

TMTOWTDI:

+
@sorted =
+ map unpack("x$length A*",
+ $_) => ...
+ +

If any of the subkeys has varying length, make sure that the last character in the complete packed sortkey is a null byte, then search for it from the right:

+ +

Preferred for efficiency:

+
@sorted = map substr($_,
+ 1 + rindex $_, "\0") => ... +

TMTOWTDI:

+
@sorted =
+ map (split /\0/)[-1] => ...
+@sorted = map /([^\0]+)$/ => ...
+ +

Appendix C: The Sort::Maker module

+

GO TO TOWN, URI! ...

+ + + diff --git a/slides/Makefile b/slides/Makefile new file mode 100644 index 0000000..1cc42fc --- /dev/null +++ b/slides/Makefile @@ -0,0 +1,10 @@ + + +slides/index.html : clean sort_maker.txt + make_slides.pl sort_maker.txt + +ftp : slides/index.html + scp -r * stemsystems.com:www/sort/slides + +clean : + -/bin/rm slides/*.html diff --git a/slides/slides/index.html b/slides/slides/index.html new file mode 100644 index 0000000..9f8b153 --- /dev/null +++ b/slides/slides/index.html @@ -0,0 +1,26 @@ + + + +Index of Sort::Maker + + +

Index of Sort::Maker +

+

+

    +
  • 1. Sort::Maker + + diff --git a/slides/slides/slide-0101.html b/slides/slides/slide-0101.html new file mode 100644 index 0000000..db2e16b --- /dev/null +++ b/slides/slides/slide-0101.html @@ -0,0 +1,59 @@ + + + + A Little Past History + +

    1.1: A Little Past History

    + + + +
    + Prev + Next + Index + + Sort::Maker + Page 1/12 +
    +
    +
      +
    • TPC3 in Monterey, CA 1999 + +
    • Co-wrote and presented "A Fresh Look at Efficient Perl Sorting" + + + http://www.sysarch.com/perl/sort_paper.html +
    • Co-author was Larry Rosler (retired from HP and Perl) + +
    • Introduced the GRT as a way to speed up Perl sorts + +
    • It won the prize for Best Technical Paper + +
    • A module Sort::Records was promised which would implement the GRT + +
    • Sort::Records was started but a code and design review showed many + problems. + +
    • Sort::Records was shelved during development because of a code review + +
    • People still write to me asking about that module +
    +
    + + + + + + + +
    + Prev + Next + Index + + YAPC::NA 2004, Buffalo, NY + Page 1/12 +
    + © 2004 Uri Guttman +
    + \ No newline at end of file diff --git a/slides/slides/slide-0102.html b/slides/slides/slide-0102.html new file mode 100644 index 0000000..84d32ca --- /dev/null +++ b/slides/slides/slide-0102.html @@ -0,0 +1,50 @@ + + + + Recent History + +

    1.2: Recent History

    + + + +
    + Prev + Next + Index + + Sort::Maker + Page 2/12 +
    +
    +
      +
    • I have been thinking about this module for 5 years + +
    • Solved the major design flaw - have the user provide key extraction + code + +
    • A thread in perl6-language gave needed inspiration + +
    • Sort::Maker emerges + +
    • Much help in the API design and some coding tricks from Damian Conway + +
    +
    + + + + + + + +
    + Prev + Next + Index + + YAPC::NA 2004, Buffalo, NY + Page 2/12 +
    + © 2004 Uri Guttman +
    + \ No newline at end of file diff --git a/slides/slides/slide-0103.html b/slides/slides/slide-0103.html new file mode 100644 index 0000000..327c01f --- /dev/null +++ b/slides/slides/slide-0103.html @@ -0,0 +1,64 @@ + + + + Introducing Sort::Maker + +

    1.3: Introducing Sort::Maker

    + + + +
    + Prev + Next + Index + + Sort::Maker + Page 3/12 +
    +
    + +
    + + + + + + + +
    + Prev + Next + Index + + YAPC::NA 2004, Buffalo, NY + Page 3/12 +
    + © 2004 Uri Guttman +
    + \ No newline at end of file diff --git a/slides/slides/slide-0104.html b/slides/slides/slide-0104.html new file mode 100644 index 0000000..320dbeb --- /dev/null +++ b/slides/slides/slide-0104.html @@ -0,0 +1,60 @@ + + + + Sort::Maker Synopsis + +

    1.4: Sort::Maker Synopsis

    + + + +
    + Prev + Next + Index + + Sort::Maker + Page 4/12 +
    +
    +
      +
    • Exports a single sub 'make_sorter' + +
    • Each key has its own description + +
    • Call make_sorter with sort description arguments and it returns a + code reference. + +
    • Call that reference with unsorted input records + +
    • Returns sorted records +
      +	my $sorter = make_sorter(
      +		qw( plain ),
      +		number => [
      +			code => '/(\d+)/',
      +			'descending',
      +		],
      +	) ;
      +
      +	my @sorted = $sorter->( @unsorted ) ;
      +
      +
    +
    + + + + + + + +
    + Prev + Next + Index + + YAPC::NA 2004, Buffalo, NY + Page 4/12 +
    + © 2004 Uri Guttman +
    + \ No newline at end of file diff --git a/slides/slides/slide-0105.html b/slides/slides/slide-0105.html new file mode 100644 index 0000000..403237c --- /dev/null +++ b/slides/slides/slide-0105.html @@ -0,0 +1,51 @@ + + + + Key Descriptions + +

    1.5: Key Descriptions

    + + + +
    + Prev + Next + Index + + Sort::Maker + Page 5/12 +
    +
    +
      +
    • Each key has a description + +
    • Keys must be in the order they will sort (higher level keys must be + earlier) + +
    • The key type must be specified as 'string' or 'number' + +
    • Keys can have attributes such as 'descending' or 'case' + +
    • Attributes can have defaults set for all the keys + +
    • Default attributes can be overridden in any key description +
    +
    + + + + + + + +
    + Prev + Next + Index + + YAPC::NA 2004, Buffalo, NY + Page 5/12 +
    + © 2004 Uri Guttman +
    + \ No newline at end of file diff --git a/slides/slides/slide-0106.html b/slides/slides/slide-0106.html new file mode 100644 index 0000000..ce5d414 --- /dev/null +++ b/slides/slides/slide-0106.html @@ -0,0 +1,48 @@ + + + + Extraction Code + +

    1.6: Extraction Code

    + + + +
    + Prev + Next + Index + + Sort::Maker + Page 6/12 +
    +
    +
      +
    • Each key needs code to extract it from a record + +
    • Each record is aliased to $_ (via map) + +
    • Key extraction code operates on $_ and gets the value for this key + +
    • The extraction code is executed in list context so m/(foo)/ works + +
    • The code is inside a do{} block so you can have multiple statements +
    +
    + + + + + + + +
    + Prev + Next + Index + + YAPC::NA 2004, Buffalo, NY + Page 6/12 +
    + © 2004 Uri Guttman +
    + \ No newline at end of file diff --git a/slides/slides/slide-0107.html b/slides/slides/slide-0107.html new file mode 100644 index 0000000..b22b95a --- /dev/null +++ b/slides/slides/slide-0107.html @@ -0,0 +1,76 @@ + + + + Sort Styles + +

    1.7: Sort Styles

    + + + +
    + Prev + Next + Index + + Sort::Maker + Page 7/12 +
    +
    +
      +
    • Four different sorting styles to choose from + +
        +
      • Plain + +
      +
        +
      • Orcish Manouvre + +
      +
        +
      • Schwartian Transform (ST) + +
      +
        +
      • Guttman-Rosler Transform (GRT) + +
      +
    • Each has its uses and advantages + +
    • Styles are really different ways to cache extracted keys + +
    • Caching keys moves key extraction from O( N log N ) to O( N ) + +
    • In larger sizes of sort sets, caching keys is a very big win + +
    • This is a classic sort of arrays of numbers + +
        +
      • Compare this code to the generated code for the different sort styles + +
      +
      +	sort { $a->[0] cmp $b->[0] ||
      +	       $a->[1] cmp $b->[1]
      +	}
      +
      +
    +
    + + + + + + + +
    + Prev + Next + Index + + YAPC::NA 2004, Buffalo, NY + Page 7/12 +
    + © 2004 Uri Guttman +
    + \ No newline at end of file diff --git a/slides/slides/slide-0108.html b/slides/slides/slide-0108.html new file mode 100644 index 0000000..bb9c03a --- /dev/null +++ b/slides/slides/slide-0108.html @@ -0,0 +1,59 @@ + + + + Plain Sort + +

    1.8: Plain Sort

    + + + +
    + Prev + Next + Index + + Sort::Maker + Page 8/12 +
    +
    +
      +
    • No key caching + +
    • Similar to common sorts with a code block + +
    • Good for small sort sets as there is no caching + +
    • Pass 'plain' option to make_sorter +
      +	sub {
      +
      +		sort {
      +		do{ my( $left, $right ) = map { $_->[0] } $a, $b;
      +			$left cmp $right }
      +			||
      +		do{ my( $left, $right ) = map { $_->[1] } $a, $b;
      +			$left cmp $right }
      +
      +		} @_  ;
      +	}
      +
      +
    +
    + + + + + + + +
    + Prev + Next + Index + + YAPC::NA 2004, Buffalo, NY + Page 8/12 +
    + © 2004 Uri Guttman +
    + \ No newline at end of file diff --git a/slides/slides/slide-0109.html b/slides/slides/slide-0109.html new file mode 100644 index 0000000..ab7a1fa --- /dev/null +++ b/slides/slides/slide-0109.html @@ -0,0 +1,84 @@ + + + + Orcish Manouvre Sort + +

    1.9: Orcish Manouvre Sort

    + + + +
    + Prev + Next + Index + + Sort::Maker + Page 9/12 +
    +
    +
      +
    • Caches extracted keys in a hash + +
    • Assignments to the hash in ||= + +
    • Called the orchish because of Or-Cache + +
    • Created by Joseph Hall + +
    • It will re-extract keys for any record with a false value + +
    • The caching hash must be cleared beforehand + +
        +
      • Sort::Maker declares a caching hash in the anonymous sub + +
      +
    • Hash lookups are still O( N log N ) + +
    • Good for medium sort sets + +
    • Pass 'orcish' option to make_sorter +
      	sub {
      +		my %or_cache ;
      +
      +
      +		sort {
      +		(
      +		  ( $or_cache{$a} ||=
      +			do{ my ($val) = map { $_->[0] } $a ; $val } )
      +				cmp
      +		  ( $or_cache{$b} ||=
      +			do{ my ($val) = map { $_->[0] } $b ; $val } )
      +		)
      +			||
      +		(
      +		  ( $or_cache{$a} ||=
      +			do{ my ($val) = map { $_->[1] } $a ; $val } )
      +				cmp
      +		  ( $or_cache{$b} ||=
      +			do{ my ($val) = map { $_->[1] } $b ; $val } )
      +		)
      +
      +		} @_  ;
      +	}
      +
      +
    +
    + + + + + + + +
    + Prev + Next + Index + + YAPC::NA 2004, Buffalo, NY + Page 9/12 +
    + © 2004 Uri Guttman +
    + \ No newline at end of file diff --git a/slides/slides/slide-0110.html b/slides/slides/slide-0110.html new file mode 100644 index 0000000..d6c1788 --- /dev/null +++ b/slides/slides/slide-0110.html @@ -0,0 +1,68 @@ + + + + Schwartian Transform (ST) Sort + +

    1.10: Schwartian Transform (ST) Sort

    + + + +
    + Prev + Next + Index + + Sort::Maker + Page 10/12 +
    +
    +
      +
    • Caches extracted keys in an anonymous array for each input record + +
    • Stores the record itself in slot 0 of the array + +
    • Uses the map/sort/map idiom + +
    • Popularized by Randal Schwartz + +
    • Good for medium to large sort sets + +
    • Key extraction is O( N ) + +
    • Does a full callback to Perl in the comparison block + +
    • Pass 'ST' option to make_sorter +
      sub {
      +        map $_->[0],
      +        sort {
      +        $a->[1] cmp $b->[1]
      +                ||
      +        $a->[2] cmp $b->[2]
      +
      +        }
      +        map [ $_,
      +                do{ my ($val) = $_->[0] ; $val },
      +                do{ my ($val) = $_->[1] ; $val }
      +        ], @_  ;
      +}
      +
      +
    +
    + + + + + + + +
    + Prev + Next + Index + + YAPC::NA 2004, Buffalo, NY + Page 10/12 +
    + © 2004 Uri Guttman +
    + \ No newline at end of file diff --git a/slides/slides/slide-0111.html b/slides/slides/slide-0111.html new file mode 100644 index 0000000..d94332b --- /dev/null +++ b/slides/slides/slide-0111.html @@ -0,0 +1,87 @@ + + + + Guttman-Rosler Transform (GRT) Sort + +

    1.11: Guttman-Rosler Transform (GRT) Sort

    + + + +
    + Prev + Next + Index + + Sort::Maker + Page 11/12 +
    +
    +
      +
    • Caches extracted keys in a single string for each input record + +
    • String records can be stored in that string + +
    • Reference records have their index stored in the cache string + +
        +
      • The sorted indices are used to slice into the input array to create + sorted records + +
      +
    • Key extraction is O( N ) + +
    • The comparison code is internal and uses C with no Perl callback (big + win) + +
    • More complex to use properly and efficiently + +
    • GRT has special key description attributes for optimization + +
        +
      • Numbers can be integer/float, signed/unsigned + +
      +
        +
      • Strings can be fixed/varying + +
      +
        +
      • Fastest sort style for larger sets of sort records + +
      +
    • Pass 'GRT' option to make_sorter + +
      sub {
      +        my $rec_ind = 0 ;
      +
      +        return  @_[
      +            map unpack( 'N', substr( $_, -4 ) ), 
      +            sort
      +            map pack( "Z*Z*N",
      +                do{ my( $val ) = $_->[0] ; ( $val ) },
      +                do{ my( $val ) = $_->[1] ; ( $val ) },
      +                $rec_ind++
      +            ), @_
      +        ] ;
      +}
      +
      +
    +
    + + + + + + + +
    + Prev + Next + Index + + YAPC::NA 2004, Buffalo, NY + Page 11/12 +
    + © 2004 Uri Guttman +
    + \ No newline at end of file diff --git a/slides/slides/slide-0112.html b/slides/slides/slide-0112.html new file mode 100644 index 0000000..3bb3916 --- /dev/null +++ b/slides/slides/slide-0112.html @@ -0,0 +1,83 @@ + + + + Tests and Benchmarks + +

    1.12: Tests and Benchmarks

    + + + +
    + Prev + Next + Index + + Sort::Maker + Page 12/12 +
    +
    +
      +
    • Test system is table driven + +
    • Scripts can do both tests and benchmarks + +
    • Provide a set of data + +
        +
      • Manually + +
      +
        +
      • Generate in place with map + +
      +
        +
      • Generate with a anonymous sub + +
      +
    • Provide a hand written 'golden' sort sub + +
    • Provide the arguments to make_sorter() + +
    • Many tests are in but more are wanted + +
        +
      • Send in tests if you want + +
      +
    • More complex tests are needed + +
      +	{
      +		skip	=> 0,
      +		name	=> 'arrays of multiple strings',
      +		source	=> 1,
      +		data	=> [ map {
      +				[ rand_token( 8, 20 ), rand_token( 8, 20 ), ]
      +			} 1 .. 100
      +		],
      +		gold	=> sub { $a->[0] cmp $b->[0] ||
      +				 $a->[1] cmp $b->[1] },
      +		args	=> [ qw( string $_->[0] string $_->[1] ) ],
      +	},
      +
      +
    +
    + + + + + + + +
    + Prev + Next + Index + + YAPC::NA 2004, Buffalo, NY + Page 12/12 +
    + © 2004 Uri Guttman +
    + \ No newline at end of file diff --git a/slides/slides/toc.html b/slides/slides/toc.html new file mode 100644 index 0000000..f65a70e --- /dev/null +++ b/slides/slides/toc.html @@ -0,0 +1,13 @@ + + + +Table of Contents for Sort::Maker + + +

    Table of Contents for Sort::Maker +

    +

    +

    + diff --git a/slides/sort_maker.txt b/slides/sort_maker.txt new file mode 100644 index 0000000..1d11f2b --- /dev/null +++ b/slides/sort_maker.txt @@ -0,0 +1,271 @@ + +book: Sort::Maker +chapter: Sort::Maker + +title: A Little Past History + +*: TPC3 in Monterey, CA 1999 +*: Co-wrote and presented "A Fresh Look at Efficient Perl Sorting" +html: + http://www.sysarch.com/perl/sort_paper.html +*: Co-author was Larry Rosler (retired from HP and Perl) +*: Introduced the GRT as a way to speed up Perl sorts +*: It won the prize for Best Technical Paper +*: A module Sort::Records was promised which would implement the GRT +*: Sort::Records was started but a code and design review showed many + problems. +*: Sort::Records was shelved during development because of a code review +*: People still write to me asking about that module + +PAGE_END + +title: Recent History + +*: I have been thinking about this module for 5 years +*: Solved the major design flaw - have the user provide key extraction + code +*: A thread in perl6-language gave needed inspiration +*: Sort::Maker emerges +*: Much help in the API design and some coding tricks from Damian Conway +PAGE_END + +title: Introducing Sort::Maker + +*: Easy description of sorts +*: Generates a sort sub +*: Choice of generated sort styles +*: Can generate the highest speed sorts in Perl +*: Generated source can be printed +**: For study +**: For pasting in code to save the generation time overhead +*: These slides can be found at + +html: + http://www.stemsystems.com/slides/slides/index.html
    +*: The current tarball is at + +html: + http://www.stemsystems.com/sort/Sort-Maker-0.01.tar.gz
    + +PAGE_END + +title: Sort::Maker Synopsis + +*: Exports a single sub 'make_sorter' +*: Each key has its own description +*: Call make_sorter with sort description arguments and it returns a + code reference. +*: Call that reference with unsorted input records +*: Returns sorted records + +code: + + my $sorter = make_sorter( + qw( plain ), + number => [ + code => '/(\d+)/', + 'descending', + ], + ) ; + + my @sorted = $sorter->( @unsorted ) ; + +PAGE_END + +title: Key Descriptions + +*: Each key has a description +*: Keys must be in the order they will sort (higher level keys must be + earlier) +*: The key type must be specified as 'string' or 'number' +*: Keys can have attributes such as 'descending' or 'case' +*: Attributes can have defaults set for all the keys +*: Default attributes can be overridden in any key description + +PAGE_END + +title: Extraction Code + +*: Each key needs code to extract it from a record +*: Each record is aliased to $_ (via map) +*: Key extraction code operates on $_ and gets the value for this key +*: The extraction code is executed in list context so m/(foo)/ works +*: The code is inside a do{} block so you can have multiple statements + +PAGE_END + +title: Sort Styles + +*: Four different sorting styles to choose from +**: Plain +**: Orcish Manouvre +**: Schwartian Transform (ST) +**: Guttman-Rosler Transform (GRT) +*: Each has its uses and advantages +*: Styles are really different ways to cache extracted keys +*: Caching keys moves key extraction from O( N log N ) to O( N ) +*: In larger sizes of sort sets, caching keys is a very big win +*: This is a classic sort of arrays of numbers +**: Compare this code to the generated code for the different sort styles +code: + + sort { $a->[0] cmp $b->[0] || + $a->[1] cmp $b->[1] + } + +PAGE_END + +title: Plain Sort + +*: No key caching +*: Similar to common sorts with a code block +*: Good for small sort sets as there is no caching +*: Pass 'plain' option to make_sorter + +code: + + sub { + + sort { + do{ my( $left, $right ) = map { $_->[0] } $a, $b; + $left cmp $right } + || + do{ my( $left, $right ) = map { $_->[1] } $a, $b; + $left cmp $right } + + } @_ ; + } + +PAGE_END + +title: Orcish Manouvre Sort + +*: Caches extracted keys in a hash +*: Assignments to the hash in ||= +*: Called the orcish because of Or-Cache +*: Created by Joseph Hall +*: It will re-extract keys for any record with a false value +*: The caching hash must be cleared beforehand +**: Sort::Maker declares a caching hash in the anonymous sub +*: Hash lookups are still O( N log N ) +*: Good for medium sort sets +*: Pass 'orcish' option to make_sorter + +code: + sub { + my %or_cache ; + + + sort { + ( + ( $or_cache{$a} ||= + do{ my ($val) = map { $_->[0] } $a ; $val } ) + cmp + ( $or_cache{$b} ||= + do{ my ($val) = map { $_->[0] } $b ; $val } ) + ) + || + ( + ( $or_cache{$a} ||= + do{ my ($val) = map { $_->[1] } $a ; $val } ) + cmp + ( $or_cache{$b} ||= + do{ my ($val) = map { $_->[1] } $b ; $val } ) + ) + + } @_ ; + } + +PAGE_END + +title: Schwartian Transform (ST) Sort + +*: Caches extracted keys in an anonymous array for each input record +*: Stores the record itself in slot 0 of the array +*: Uses the map/sort/map idiom +*: Popularized by Randal Schwartz +*: Good for medium to large sort sets +*: Key extraction is O( N ) +*: Does a full callback to Perl in the comparison block +*: Pass 'ST' option to make_sorter + +code: +sub { + map $_->[0], + sort { + $a->[1] cmp $b->[1] + || + $a->[2] cmp $b->[2] + + } + map [ $_, + do{ my ($val) = $_->[0] ; $val }, + do{ my ($val) = $_->[1] ; $val } + ], @_ ; +} + +PAGE_END + +title: Guttman-Rosler Transform (GRT) Sort + +*: Caches extracted keys in a single string for each input record +*: String records can be stored in that string +*: Reference records have their index stored in the cache string +**: The sorted indices are used to slice into the input array to create + sorted records +*: Key extraction is O( N ) +*: The comparison code is internal and uses C with no Perl callback (big + win) +*: More complex to use properly and efficiently +*: GRT has special key description attributes for optimization +**: Numbers can be integer/float, signed/unsigned +**: Strings can be fixed/varying +**: Fastest sort style for larger sets of sort records +*: Pass 'GRT' option to make_sorter +code: +sub { + my $rec_ind = 0 ; + + return @_[ + map unpack( 'N', substr( $_, -4 ) ), + sort + map pack( "Z*Z*N", + do{ my( $val ) = $_->[0] ; ( $val ) }, + do{ my( $val ) = $_->[1] ; ( $val ) }, + $rec_ind++ + ), @_ + ] ; +} + +PAGE_END + +title: Tests and Benchmarks + +*: Test system is table driven +*: Scripts can do both tests and benchmarks +*: Provide a set of data +**: Manually +**: Generate in place with map +**: Generate with a anonymous sub +*: Provide a hand written 'golden' sort sub +*: Provide the arguments to make_sorter() +*: Many tests are in but more are wanted +**: Send in tests if you want +*: More complex tests are needed +code: + + { + skip => 0, + name => 'arrays of multiple strings', + source => 1, + data => [ map { + [ rand_token( 8, 20 ), rand_token( 8, 20 ), ] + } 1 .. 100 + ], + gold => sub { $a->[0] cmp $b->[0] || + $a->[1] cmp $b->[1] }, + args => [ qw( string $_->[0] string $_->[1] ) ], + }, + +PAGE_END + diff --git a/slides/templates/bullet.tmpl b/slides/templates/bullet.tmpl new file mode 100644 index 0000000..ebd7300 --- /dev/null +++ b/slides/templates/bullet.tmpl @@ -0,0 +1 @@ +
  • <%TEXT%> diff --git a/slides/templates/bullet2.tmpl b/slides/templates/bullet2.tmpl new file mode 100644 index 0000000..ebd7300 --- /dev/null +++ b/slides/templates/bullet2.tmpl @@ -0,0 +1 @@ +
  • <%TEXT%> diff --git a/slides/templates/code.tmpl b/slides/templates/code.tmpl new file mode 100644 index 0000000..484c492 --- /dev/null +++ b/slides/templates/code.tmpl @@ -0,0 +1 @@ +
    <%TEXT%>
    \ No newline at end of file diff --git a/slides/templates/footer.tmpl b/slides/templates/footer.tmpl new file mode 100644 index 0000000..214b3d5 --- /dev/null +++ b/slides/templates/footer.tmpl @@ -0,0 +1,19 @@ +
    + + + + + + + +
    + Prev + Next + Index + + YAPC::NA 2004, Buffalo, NY + Page <%PAGE_NUM%>/<%TOTAL_PAGES%> +
    + © 2004 Uri Guttman +
    + \ No newline at end of file diff --git a/slides/templates/header.tmpl b/slides/templates/header.tmpl new file mode 100644 index 0000000..308bb0d --- /dev/null +++ b/slides/templates/header.tmpl @@ -0,0 +1,18 @@ + + + +<%TITLE%> + +

    <%CHAP_NUM%>.<%SLIDE_NUM%><%CHAP_ABBREV%>: <%TITLE%>

    + + + +
    + Prev + Next + Index + + Sort::Maker + Page <%PAGE_NUM%>/<%TOTAL_PAGES%> +
    +
    diff --git a/slides/templates/para.tmpl b/slides/templates/para.tmpl new file mode 100644 index 0000000..6535451 --- /dev/null +++ b/slides/templates/para.tmpl @@ -0,0 +1,3 @@ +

    +<%TEXT%> +

    diff --git a/t/GRT.t b/t/GRT.t new file mode 100644 index 0000000..c1aca96 --- /dev/null +++ b/t/GRT.t @@ -0,0 +1,151 @@ +#!/usr/local/bin/perl -sw + +use strict ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +my @sort_styles = qw( GRT ) ; + +my $sort_tests = [ + + { + skip => 0, + name => 'simple string', + data => [ qw( z e a k ) ], + gold => sub { $a cmp $b }, + args => [ qw( string ) ], + }, + { + skip => 0, + name => 'unsigned integer', + data => [ 32, 2, 9, 7 ], + gold => sub { $a <=> $b }, + args => [ qw( unsigned number ) ], + }, + { + skip => 0, + name => 'unsigned integer descending', + data => [ 32, 2, 9, 7 ], + gold => sub { $b <=> $a }, + args => [ qw( unsigned number descending ) ], + }, + { + skip => 0, + name => 'signed integer', + data => [ 32, -2, 9, -7 ], + gold => sub { $a <=> $b }, + args => [ qw( signed number ) ], + }, + { + skip => 0, + name => 'signed integer descending', + data => [ 32, -2, 9, -7 ], + gold => sub { $b <=> $a }, + args => [ qw( signed number descending ) ], + }, + { + skip => 0, + name => 'unsigned float', + data => [ 32, 2, 9, 7 ], + gold => sub { $a <=> $b }, + args => [ qw( unsigned_float number ) ], + }, + { + skip => 0, + name => 'unsigned float descending', + data => [ 32, 2, 9, 7.0, 7.1 ], + gold => sub { $b <=> $a }, + args => [ qw( unsigned_float number descending ) ], + }, + { + skip => 0, + name => 'signed float', + data => [ 32, -2, 9, -7 ], + gold => sub { $a <=> $b }, + args => [ qw( signed_float number ) ], + }, + { + skip => 0, + name => 'signed float descending', + data => [ 32, -2, 9, -7.0, -7.1 ], + gold => sub { $b <=> $a }, + args => [ qw( signed_float number descending ) ], + }, + { + skip => 0, + name => 'plain string', + data => [ qw( bdhd BDhd wxj ayewwq rjjx ) ], + gold => sub { $a cmp $b }, + args => [ qw( string ) ], + }, + { + skip => 0, + name => 'plain string no_case', + data => [ qw( bdhd BDhd wxj ayewwq rjjx ) ], + gold => sub { uc $a cmp uc $b }, + args => [ qw( no_case string ) ], + }, + { + skip => 0, + name => 'fixed string', + data => [ qw( bdhd BDhd wxj ayewwq rjjx ), "w\0j" ], + gold => sub { $a cmp $b }, + args => [ qw( string fixed 6 ) ], + }, + { + skip => 0, + name => 'string no_case fixed', + data => [ qw( bdhd BDhd wxj ayewwq rjjx ), "w\0j" ], + gold => sub { uc $a cmp uc $b }, + args => [ qw( string no_case fixed 6 ) ], + }, + { + skip => 0, + name => 'string descending fixed', + data => [ qw( bdhd BDhd wxj ayewwq rjjx ), "w\0j" ], + gold => sub { $b cmp $a }, + args => [ qw( string descending fixed 6 ) ], + }, + { + skip => 0, + name => 'string no_case descending fixed', + data => [ qw( bdhd BDhd wxj ayewwq rjjx ), "w\0j" ], + gold => sub { uc $b cmp uc $a }, + args => [ qw( string no_case descending fixed 6 ) ], + }, + + { + skip => 0, + name => 'varying string', + data => [ qw( bdhd BDhd wxj ayewwq rjjx ), "w\0j" ], + gold => sub { $a cmp $b }, + args => [ qw( string varying ) ], + }, + { + skip => 0, + name => 'string no_case varying', + data => [ qw( bdhd BDhd wxj ayewwq rjjx ), "w\0j" ], + gold => sub { uc $a cmp uc $b }, + args => [ qw( string no_case varying ) ], + }, + { + skip => 0, + name => 'string descending varying', + data => [ qw( bdhd BDhd wxj ayewwq rjjx ), "w\0j" ], + gold => sub { $b cmp $a }, + args => [ qw( string descending varying ) ], + }, + { + skip => 0, + name => 'string no_case descending varying', + data => [ qw( bdhd BDhd wxj ayewwq rjjx ), "w\0j" ], + gold => sub { uc $b cmp uc $a }, + args => [ qw( string no_case descending varying ) ], + }, +] ; + +common_driver( $sort_tests, \@sort_styles ) ; + +exit ; diff --git a/t/arrays.t b/t/arrays.t new file mode 100755 index 0000000..ae30c1a --- /dev/null +++ b/t/arrays.t @@ -0,0 +1,60 @@ +#!/usr/local/bin/perl -sw + +use strict ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +my $sort_styles = [ qw( plain orcish ST GRT ) ] ; + +my $sort_tests = [ + + { + skip => 0, + name => 'arrays of strings', + data => [ map { + [ rand_token( 8, 20 ) ] + } 1 .. 100 + ], + gold => sub { $a->[0] cmp $b->[0] }, + args => [ qw( string $_->[0] ) ], + }, + { + skip => 0, + name => 'arrays of numbers', + data => [ map { + [ rand_number( 1, 20 ) ] + } 1 .. 100 + ], + gold => sub { $a->[0] <=> $b->[0] }, + args => [ qw( number $_->[0] ) ], + }, + { + skip => 0, + name => 'arrays of multiple strings', + source => 0, + data => [ map { + [ rand_token( 8, 20 ), rand_token( 8, 20 ), ] + } 1 .. 100 + ], + gold => sub { $a->[0] cmp $b->[0] || + $a->[1] cmp $b->[1] }, + args => [ qw( string $_->[0] string $_->[1] ) ], + }, + { + skip => 0, + name => 'arrays of multiple numbers', + data => [ map { + [ rand_number( 1, 20 ), rand_number( 1, 20 ) ] + } 1 .. 100 + ], + gold => sub { $a->[0] <=> $b->[0] || + $a->[1] <=> $b->[1] }, + args => [ qw( number $_->[0] number $_->[1] ) ], + }, +] ; + +common_driver( $sort_tests, $sort_styles ) ; + +exit ; diff --git a/t/bad_code.t b/t/bad_code.t new file mode 100644 index 0000000..3be38a7 --- /dev/null +++ b/t/bad_code.t @@ -0,0 +1,18 @@ +#!/usr/local/bin/perl -sw + +use strict ; + +use lib 't' ; +use lib '..' ; + +use Test::More tests => 2 ; + +use Sort::Maker qw( :all ) ; + +my $err = make_sorter( 'plain', string => [] ) ; +ok( !$err, 'bad extraction code - array ref' ) ; + +$err = make_sorter( 'GRT', number => \'foo' ) ; +ok( !$err, 'bad extraction code - scalar ref' ) ; + + diff --git a/t/closure.t b/t/closure.t new file mode 100644 index 0000000..2649528 --- /dev/null +++ b/t/closure.t @@ -0,0 +1,113 @@ +#!/usr/local/bin/perl -s + +use strict ; +use warnings ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +use vars '$bench' ; + +#my @sort_styles = qw( plain ) ; +my @sort_styles = qw( plain orcish ST GRT ) ; + +# These are some months to be sorted numerically. +my @months = qw( + January + February + March + April + May + June + July + August + September + October + November + December +) ; + +# a jumbled array of months to be sorted + +my @month_jumble = qw(February June October March January + April July November August December May September); + +my %month_to_num ; +@month_to_num{ @months } = 1 .. @months ; + +# These are some months to be sorted alphabetically +# (order determined by letters). + +my %month_to_let ; +@month_to_let{ @months } = 'A' .. 'L' ; + +my $sort_tests = [ + + { + skip => 0, + name => 'closure error', + gold => sub { $month_to_num{$a} <=> $month_to_num{$b} }, + error => qr/Global symbol "%month_to_num"/, + args => { + number => [ + number => sub { $month_to_num{$_} }, + ], + }, + }, + + { + skip => 0, + source => 0, + name => 'closure numeric', + data => \@month_jumble, + gold => sub { $month_to_num{$a} <=> $month_to_num{$b} }, + args => { + number => [ + 'closure', + number => sub { $month_to_num{$_} }, + ], + }, + }, + + { + skip => 0, + source => 0, + name => 'closure string', + data => \@month_jumble, + gold => sub { $month_to_let{$a} cmp $month_to_let{$b} }, + args => { + string => [ + 'closure', + string => sub { $month_to_let{$_} }, + ], + }, + }, + + { + skip => 0, + name => 'double closure', + data => [ + [ qw( November March ) ], + [ qw( January March ) ], + [ qw( July June ) ], + [ qw( January January ) ], + ], + gold => sub { + $month_to_let{$a->[0]} cmp $month_to_let{$b->[0]} + || + $month_to_num{$a->[1]} <=> $month_to_num{$b->[1]} + }, + args => { + double => [ + 'closure', + string => sub { $month_to_let{$_->[0]} }, + number => sub { $month_to_num{$_->[1]} }, + ], + }, + }, +] ; + +common_driver( $sort_tests, \@sort_styles ) ; + +exit ; diff --git a/t/code.t b/t/code.t new file mode 100644 index 0000000..41ca9a9 --- /dev/null +++ b/t/code.t @@ -0,0 +1,61 @@ +#!/usr/local/bin/perl -sw + +use strict ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +use vars '$bench' ; + +#my @sort_styles = qw( plain ) ; +my @sort_styles = qw( plain orcish ST GRT ) ; + +#my @string_keys = map rand_alpha( 4, 8 ), 1 .. 5 ; +my @string_keys = map rand_alpha( 4, 8 ), 1 .. 100 ; + +#print "STR @string_keys NUM @number_keys\n" ; + +my $sort_tests = [ + { + skip => 0, + name => 'regex code', + gen => sub { rand_choice( @string_keys ) }, + gold => sub { ($a =~ /(\w+)/)[0] cmp ($b =~ /(\w+)/)[0] }, + args => { + string => [ qw( string /(\w+)/ ) ], + qr => [ string => qr/(\w+)/ ], + code => [ string => sub { /(\w+)/ } ], + } + }, + { + skip => 0, + name => 'array code', + data => [ map { + [ rand_token( 8, 20 ) ] + } 1 .. 100 + ], + gold => sub { $a->[0] cmp $b->[0] }, + args => { + string => [ qw( string $_->[0] ) ], + code => [ string => sub { $_->[0] } ], + } + }, + { + skip => 0, + name => 'hash code', + data => [ map { + { a => rand_token( 8, 20 ) } + } 1 .. 100 + ], + gold => sub { $a->{a} cmp $b->{a} }, + args => { + string => [ qw( string $_->{a} ) ], + code => [ string => sub { $_->{a} } ], + } + }, +] ; + +common_driver( $sort_tests, \@sort_styles ) ; + +exit ; diff --git a/t/common.pm b/t/common.pm new file mode 100644 index 0000000..1ac5082 --- /dev/null +++ b/t/common.pm @@ -0,0 +1,313 @@ +use strict ; + +use Data::Dumper ; + +use Test::More ; +use Benchmark ; + +use Sort::Maker qw( :all ) ; + +use vars '$bench' ; + +sub common_driver { + + my( $sort_tests, $sort_styles, $default_sizes ) = @_ ; + + if ( $bench ) { + + benchmark_driver( $sort_tests, $sort_styles, $default_sizes ) ; + return ; + } + + test_driver( $sort_tests, $sort_styles ) ; +} + +sub test_driver { + + my( $sort_tests, $default_styles ) = @_ ; + + $default_styles ||= [] ; + + my $total_tests = count_tests( $sort_tests, $default_styles ) ; + + plan tests => $total_tests ; + + foreach my $test ( @{$sort_tests} ) { + + if ( $test->{skip} ) { + + SKIP: { + skip( "sort of $test->{name}\n", + $test->{count} ) ; + } + next ; + } + + make_test_sorters( $test, $default_styles ) ; + + if ( $test->{error} ) { + + handle_errors( $test ) ; + next ; + } + + $test->{data} ||= generate_data( $test ) ; + +#print Dumper $test->{data} ; + + run_tests( $test ) ; + } +} + +sub handle_errors { + + my( $test ) = @_ ; + + foreach my $sort_name ( sort test_name_cmp keys %{$test->{sorters}} ) { + +#print "NAME $sort_name\n" ; + if ( my $error = $test->{make_error}{$sort_name} ) { + + if ( $test->{error} && $error =~ /$test->{error}/ ) { + + ok( 1, "$sort_name sort of $test->{name}" ) ; + } + else { + + ok( 0, "$sort_name sort of $test->{name}" ) ; + print "unexpected error:\n$@\n" ; + } + } + } +} + +sub run_tests { + + my( $test ) = @_ ; + + my $input = $test->{data} ; + + my @gold_sorted = sort { $test->{gold}->() } @{$input} ; + + foreach my $sort_name ( sort test_name_cmp keys %{$test->{sorters}} ) { + + my @sorter_in = $sort_name =~ /ref_in/ ? $input : @{$input} ; + + my $sorter = $test->{sorters}{$sort_name} ; + my @test_sorted = $sorter->( @sorter_in ) ; + @test_sorted = @{$test_sorted[0]} if $sort_name =~ /ref_out/ ; + + my $ok = eq_array( \@gold_sorted, \@test_sorted ) ; + +print "TEST [@test_sorted]\n" unless $ok ; +print "GOLD [@gold_sorted]\n" unless $ok ; + + ok( $ok, "$sort_name sort of $test->{name}" ) ; + } +} + +sub test_name_cmp { + + my @a = split /_/, $a ; + my @b = split /_/, $b ; + + lc $a[0] cmp lc $b[0] + || + lc $a[1] cmp lc $b[1] + || + lc $a[2] cmp lc $b[2] +} + +sub benchmark_driver { + + my( $sort_tests, $default_styles, $default_sizes ) = @_ ; + + my $duration = shift @ARGV || -2 ; + + foreach my $test ( @{$sort_tests} ) { + + next if $test->{skip} ; + + $test->{input_sets} = [generate_data( $test, $default_sizes )] ; + + make_test_sorters( $test, $default_styles ) ; + + run_benchmarks( $test, $duration ) ; + } +} + +sub run_benchmarks { + + my( $test, $duration ) = @_ ; + + my( %entries, @input, $in_ref ) ; + + while( my( $name, $sorter ) = each %{$test->{sorters}} ) { + + $entries{ $name } = $name =~ /ref_in/ ? + sub { my @sorted = $sorter->( $in_ref ) } : + sub { my @sorted = $sorter->( @input ) } ; + } + + $entries{ 'gold' } = + sub { my @sorted = sort { $test->{gold}->() } @input } ; + + foreach my $input_set ( @{$test->{input_sets}} ) { + + my $size = @{$input_set} ; + + print "Sorting $size elements of '$test->{name}'\n" ; + + @input = @{$input_set} ; + $in_ref = $input_set ; + + timethese( $duration, \%entries ) ; + } +} + +sub generate_data { + + my( $test, $default_sizes ) = @_ ; + + my $gen_code = $test->{gen} ; + $gen_code or die "no 'gen' code for test $test->{name}" ; + + my @sizes = @{ $test->{sizes} || $default_sizes || [100] } ; + +# return a single data set when called in scalar context (from test_driver) + + return [ map $gen_code->(), 1 .. shift @sizes ] unless wantarray ; + +# return multiple data sets when called in list context (from benchmark_driver) + + return map [ map $gen_code->(), 1 .. $_ ], @sizes ; +} + +sub make_test_sorters { + + my( $test, $default_styles ) = @_ ; + + my $styles = $test->{styles} || $default_styles ; + +# if no styles, we need a dummy style just to force the style loop + + $styles = [ qw(NO_STYLE) ] unless @{$styles} ; + + my $suffix = ( $test->{ref_in} ? '_RI' : '' ) . + ( $test->{ref_out} ? '_RO' : '' ) ; + + my $args = $test->{args} or die "$test->{name} has no args\n" ; + my $arg_sets = ( ref $args eq 'HASH' ) ? $args : { '' => $args } ; + + foreach my $arg_name ( sort keys %{$arg_sets} ) { + + my $test_args = $arg_sets->{$arg_name} ; + + foreach my $style ( @{$styles} ) { + + my $sort_name = $arg_name ? + "${style}_$arg_name" : "$style$suffix" ; + +# if no real styles, use an empty list for them + + my @style_args = $style eq 'NO_STYLE' ? () : $style ; + + my $sorter = make_sorter( @style_args, @{$test_args} ) ; + +#print "sorter [$sorter]\n" ; +#print sorter_source( $sorter ) ; + + +#print "SOURCE $test->{source}\n" ; + + unless( $sorter ) { + +#print "SORT $sort_name [$@]\n" ; + + $test->{make_error}{$sort_name} = $@ ; + $test->{sorters}{$sort_name} = 'NONE' ; + next ; + } + + print "Source of $sort_name $test->{name} is:\n", + sorter_source( $sorter ) if $test->{source} ; + + $test->{sorters}{$sort_name} = $sorter ; + } + } + +# all sorters built ok + + return 1 ; +} + +sub count_tests { + + my( $tests, $default_styles ) = @_ ; + + my $sum = 0 ; + + foreach my $test ( @{$tests} ) { + + my $style_count = @{ $test->{styles} || $default_styles } || 1 ; + + my $arg_sets_count = ref $test->{args} eq 'ARRAY' ? + 1 : keys %{$test->{args}} ; + + my $test_count = $style_count * $arg_sets_count ; + $test->{count} = $test_count ; + + $sum += $test_count ; + } + + return $sum ; +} + +my @alpha_digit = ( 'a' .. 'z', 'A' .. 'Z', '0' .. '9' ) ; +my @alpha = ( 'a' .. 'z', 'A' .. 'Z' ) ; +my @bytes = ( "\x00" .. "\xff" ) ; + +sub rand_token { + + rand_string( \@alpha_digit, @_ ) ; +} + +sub rand_alpha { + + rand_string( \@alpha, @_ ) ; +} + +sub rand_bytes { + + rand_string( \@bytes, @_ ) ; +} + +sub rand_string { + + my( $char_set, $min_len, $max_len ) = @_ ; + + $min_len ||= 8 ; + $max_len ||= $min_len ; + + my $length = $min_len + int rand( $max_len - $min_len + 1 ) ; + + return join '', map $char_set->[rand @{$char_set}], 1 .. $length ; +} + +sub rand_number { + + my( $lo_range, $hi_range ) = @_ ; + + ( $lo_range, $hi_range ) = ( 0, $lo_range ) unless $hi_range ; + + my $range = $hi_range - $lo_range ; + + return rand( $range ) + $lo_range ; +} + +sub rand_choice { + + return @_[rand @_] ; +} + +1 ; diff --git a/t/descending_grt_string.t b/t/descending_grt_string.t new file mode 100644 index 0000000..18e91fe --- /dev/null +++ b/t/descending_grt_string.t @@ -0,0 +1,37 @@ +#!/usr/local/bin/perl -sw + +use strict ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +use vars qw( $a $b ) ; + +my @sort_styles = qw( GRT ) ; + +my $sort_tests = [ + + { + skip => 0, + name => 'descending fixed - numeric data', + data => [ 'dog', 10, 'camel', 2 ], + gold => sub { $b cmp $a }, + args => { + + fixed => [ qw( string descending fixed 3 ) ], + varying => [ qw( string descending varying ) ], + }, + }, +] ; + +our $bench ; + +if ( $bench ) { + benchmark_driver( $sort_tests, \@sort_styles ) ; +} +else { + test_driver( $sort_tests, \@sort_styles ) ; +} + +exit ; diff --git a/t/errors.t b/t/errors.t new file mode 100644 index 0000000..12678b0 --- /dev/null +++ b/t/errors.t @@ -0,0 +1,142 @@ +#!/usr/local/bin/perl + +use strict ; +use warnings ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +my $sort_tests = [ + + { + name => 'unknown option', + args => { + unknown => [ qw( xxx ), ], + }, + error => qr/unknown/i, + }, + + { + name => 'no keys', + styles => [ qw( plain ) ], + args => { + no_keys => [], + }, + error => qr/no keys/i, + }, + + { + name => 'duplicate style', + args => { + dup_style => [qw( GRT ST ) ], + }, + error => qr/style was already set/i, + }, + + { + name => 'no value', + args => { + no_value => [ qw( name ) ], + }, + error => qr/no value/i, + }, + { + name => 'no style', + args => { + no_style => [ qw( string ) ], + }, + error => qr/no sort style/i, + }, + + { + name => 'ascending and descending', + styles => [ qw( plain ) ], + args => { + up_and_down => [ + string => { + ascending => 1, + descending => 1, + }, + ], + }, + error => qr/has ascending/i, + }, + + { + name => 'case and no case', + styles => [ qw( plain ) ], + args => { + up_and_down => [ + string => { + case => 1, + no_case => 1, + }, + ], + }, + error => qr/has case/, + }, + + { + name => 'illegal code', + styles => [ qw( plain ) ], + args => { + illegal => [ + string => 'XXX', + ], + }, + error => qr/compile/, + }, + + { + name => 'GRT descending string', + styles => [ qw( GRT ) ], + args => { + GRT => [ + qw( string descending ) + ], + }, + error => qr/descending string/, + }, + + { + name => 'array args - no value', + styles => [ qw( ST ) ], + args => { + array => [ + qw( ref_in ref_out ), + number => [ + qw( + descending + unsigned_float + ), + 'code', + ], + ], + }, + error => qr/No value/i, + }, + + { + name => 'array args - unknown attribute', + styles => [ qw( ST ) ], + args => { + array => [ + + number => [ + qw( + descending + unsigned_float + ), + 'foobar', + ], + ], + }, + error => qr/Unknown attribute/, + }, + +] ; + +common_driver( $sort_tests ) ; + +exit ; diff --git a/t/hashes.t b/t/hashes.t new file mode 100755 index 0000000..2c1dcc4 --- /dev/null +++ b/t/hashes.t @@ -0,0 +1,61 @@ +#!/usr/local/bin/perl -sw + +use strict ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +my @sort_styles = qw( plain orcish ST GRT ) ; + +my $sort_tests = [ + + { + skip => 0, + name => 'hashes of strings', + data => [ map { + { a => rand_token( 8, 20 ) } + } 1 .. 100 + ], + gold => sub { $a->{a} cmp $b->{a} }, + args => [ qw( string $_->{a} ) ], + }, + { + skip => 0, + name => 'hashes of numbers', + data => [ map { + { a => rand_number( 1, 20 ) } + } 1 .. 100 + ], + gold => sub { $a->{a} <=> $b->{a} }, + args => [ qw( number $_->{a} ) ], + }, + { + skip => 0, + name => 'hashes of multiple strings', + data => [ map { + { a => rand_token( 8, 20 ), + b => rand_token( 8, 20 ), } + } 1 .. 100 + ], + gold => sub { $a->{a} cmp $b->{a} || + $a->{b} cmp $b->{b} }, + args => [ qw( string $_->{a} string $_->{b} ) ], + }, + { + skip => 0, + name => 'hashes of multiple numbers', + data => [ map { + { a => rand_number( 1, 20 ), + b => rand_number( 1, 20 ) } + } 1 .. 100 + ], + gold => sub { $a->{a} <=> $b->{a} || + $a->{b} <=> $b->{b} }, + args => [ qw( number $_->{a} number $_->{b} ) ], + }, +] ; + +common_driver( $sort_tests, \@sort_styles ) ; + +exit ; diff --git a/t/init_code.t b/t/init_code.t new file mode 100755 index 0000000..236d79b --- /dev/null +++ b/t/init_code.t @@ -0,0 +1,72 @@ +#!/usr/local/bin/perl -sw + +use strict ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +my @sort_styles = qw( ST GRT ) ; + +my @string_keys = map rand_alpha( 4, 8 ), 1 .. 10 ; +my @number_keys = map int rand_number( 100, 10000 ), 1 .. 10 ; + +my $sort_tests = [ + + { + skip => 0, + source => 0, + name => 'init_code', + sizes => [400, 1000], + gen => sub { rand_choice( @string_keys ) . ':' . + rand_choice( @number_keys ) }, + gold => sub { + ($a =~ /^(\w+)/)[0] cmp ($b =~ /^(\w+)/)[0] + || + ($a =~ /(\d+$)/)[0] <=> ($b =~ /(\d+$)/)[0] + }, + args => { + init_code => [ + init_code => 'my( $str, $num ) ;', + string => + 'do{( $str, $num ) = /^(\w+):(\d+)$/; $str}', + number => '$num', + ], + no_init => [ + string => '/^(\w+)/', + number => '/(\d+)$/' + ], + }, + }, + { + skip => 0, + source => 0, + name => 'deep init_code', + sizes => [400, 1000], + gen => sub { [[{'a' => rand_choice( @string_keys ) . ':' . + rand_choice( @number_keys )}]] }, + gold => sub { + ($a->[0][0]{a} =~ /^(\w+)/)[0] cmp + ($b->[0][0]{a} =~ /^(\w+)/)[0] + || + ($a->[0][0]{a} =~ /(\d+$)/)[0] <=> + ($b->[0][0]{a} =~ /(\d+$)/)[0] + }, + args => { + init_code => [ + init_code => 'my( $str, $num ) ;', + string => 'do{( $str, $num ) = + $_->[0][0]{a} =~ /^(\w+):(\d+)$/; $str}', + number => '$num', + ], + no_init => [ + string => '$_->[0][0]{a} =~ /^(\w+)/', + number => '$_->[0][0]{a} =~ /(\d+$)/', + ], + }, + }, +] ; + +common_driver( $sort_tests, \@sort_styles ) ; + +exit ; diff --git a/t/io.t b/t/io.t new file mode 100755 index 0000000..e5d4ad2 --- /dev/null +++ b/t/io.t @@ -0,0 +1,53 @@ +#!/usr/local/bin/perl -sw + +use strict ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +my @sort_styles = qw( plain orcish ST GRT ) ; + +my $sort_tests = [ + + { + skip => 0, + name => 'simple string', + data => [ qw( z e a k ) ], + gold => sub { $a cmp $b }, + sizes => [ 100, 1000 ], +# sizes => [ 5000 ], + gen => sub { rand_token() }, + args => { + default => [ qw( string ) ], + ref_in => [ qw( ref_in string ) ], + ref_out => [ qw( ref_out string ) ], + ref_in_ref_out => [ qw( ref_in ref_out string ) ], + }, + }, + { + skip => 0, + name => 'simple number', + data => [ 32, 2, 9, 7 ], + gold => sub { $a <=> $b }, + sizes => [ 100, 1000 ], + gen => sub { rand_number( 10 ) }, + args => { + default => [ qw( number ) ], + ref_in => [ qw( ref_in number ) ], + ref_out => [ qw( ref_out number ) ], + ref_in_ref_out => [ qw( ref_in ref_out number ) ], + }, + }, +] ; + +our $bench ; + +if ( $bench ) { + benchmark_driver( $sort_tests, \@sort_styles ) ; +} +else { + test_driver( $sort_tests, \@sort_styles ) ; +} + +exit ; diff --git a/t/name.t b/t/name.t new file mode 100644 index 0000000..97832d7 --- /dev/null +++ b/t/name.t @@ -0,0 +1,22 @@ +#!/usr/local/bin/perl -sw + +use strict ; +use Test::More tests => 2 ; + +use Sort::Maker ; + +my $sorter = make_sorter( name => 'sort_func', 'plain', number => 1 ) ; + +#print "$@\n" unless $sorter ; + +my @input = ( 10, 3, 40, 18 ) ; + +my @sorted = sort_func( @input ) ; + +ok( 1, 'sort name export' ) ; + +my $ok = eq_array( \@input, \@sorted ) ; + +ok( $ok, 'sort number' ) ; + +exit ; diff --git a/t/numbers.t b/t/numbers.t new file mode 100755 index 0000000..925575e --- /dev/null +++ b/t/numbers.t @@ -0,0 +1,142 @@ +#!/usr/local/bin/perl -sw + +use strict ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +my @sort_styles = qw( plain orcish ST GRT ) ; + +my $sort_tests = [ + + { + skip => 0, + name => 'unsigned float', + gold => sub { $a <=> $b }, + gen => sub { + rand_number( 0, 99999 ) * + 10 ** rand_number( -10, 3 ) + }, + args => [ qw( number unsigned_float ) ], + }, + { + skip => 0, + name => 'signed float', + gold => sub { $a <=> $b }, + gen => sub { + rand_number( -99999, 99999 ) * + 10 ** rand_number( -10, 3 ) + }, + args => [ qw( number signed_float ) ], + }, + { + skip => 0, + name => 'unsigned integer', + gold => sub { $a <=> $b }, + gen => sub { int rand_number( 0, 99999999 ) }, + args => [ qw( number unsigned ) ], + }, + { + skip => 0, + name => 'signed integer', + gold => sub { $a <=> $b }, + gen => sub { int rand_number( -99999, 99999 ) }, + args => [ qw( number signed ) ], + }, + { + skip => 0, + name => 'unsigned float edge case', + gold => sub { $a <=> $b }, + data => [ reverse 0 .. 100 ], + args => [ qw( number unsigned_float ) ], + }, + { + skip => 0, + name => 'signed float edge case', + gold => sub { $a <=> $b }, + data => [ reverse -100 .. 100 ], + args => [ qw( number signed_float ) ], + }, + { + skip => 0, + name => 'unsigned integer edge case', + gold => sub { $a <=> $b }, + data => [ reverse 0 .. 100 ], + args => [ qw( number unsigned ) ], + }, + { + skip => 0, + name => 'signed integer edge case', + gold => sub { $a <=> $b }, + data => [ -99999, 0, -999 .. 999, 99999 ], + args => [ qw( number signed ) ], + }, + { + skip => 0, + name => 'unsigned float descending', + gold => sub { $b <=> $a }, + gen => sub { + rand_number( 0, 99999 ) * + 10 ** rand_number( -10, 3 ) + }, + args => [ qw( number unsigned_float descending ) ], + }, + { + skip => 0, + name => 'signed float descending', + gold => sub { $b <=> $a }, + gen => sub { + rand_number( -99999, 99999 ) * + 10 ** rand_number( -10, 3 ) + }, + args => [ qw( number signed_float descending ) ], + }, + { + skip => 0, + name => 'unsigned integer descending', + gold => sub { $b <=> $a }, + gen => sub { int rand_number( 0, 99999999 ) }, + args => [ qw( number unsigned descending) ], + }, + { + skip => 0, + name => 'signed integer descending', + gold => sub { $b <=> $a }, + gen => sub { int rand_number( -99999, 99999 ) }, + args => [ qw( number signed descending ) ], + }, + { + skip => 0, + name => 'unsigned float edge case descending', + gold => sub { $b <=> $a }, + data => [ reverse 0 .. 100 ], + args => [ qw( number unsigned_float descending ) ], + }, + { + skip => 0, + name => 'signed float edge case descending', + gold => sub { $b <=> $a }, + data => [ reverse -100 .. 100 ], + args => [ qw( number signed_float descending ) ], + }, + { + skip => 0, + name => 'unsigned integer edge case descending', + gold => sub { $b <=> $a }, + data => [ reverse 0 .. 100 ], + args => [ qw( number unsigned descending ) ], + }, + { + skip => 0, + name => 'signed integer edge case descending', + gold => sub { $b <=> $a }, + data => [ -99999, 0, -999 .. 999, 99999 ], + args => [ qw( number signed descending ) ], + }, + +] ; + +common_driver( $sort_tests, \@sort_styles ) ; + +exit ; diff --git a/t/ref_in_varying.t b/t/ref_in_varying.t new file mode 100644 index 0000000..8b8007b --- /dev/null +++ b/t/ref_in_varying.t @@ -0,0 +1,60 @@ +#!/usr/local/bin/perl -sw + +use strict ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +use vars qw( $a $b ) ; + +my @sort_styles = qw( GRT ) ; + +my $sort_tests = [ + + { + skip => 0, +# source => 1, + name => 'ref_in varying - max length', + data => [ map { + { a => rand_token( 20, 30 ), + b => rand_token( 20, 30 ), } + } 1 .. 5 + ], + gold => sub { $a->{a} cmp $b->{a} || + $a->{b} cmp $b->{b} }, + args => { + ref_in => [ + qw( string $_->{a} string $_->{b} ref_in varying ) + ], + }, + }, + { + skip => 0, + name => 'ref_in varying descending - max length', + data => [ map { + { a => rand_token( 20, 30 ), + b => rand_token( 20, 30 ), } + } 1 .. 5 + ], + gold => sub { $b->{a} cmp $a->{a} || + $b->{b} cmp $a->{b} }, + args => { + ref_in => [ + qw( string $_->{a} string $_->{b} ref_in + varying descending ) + ], + }, + }, +] ; + +our $bench ; + +if ( $bench ) { + benchmark_driver( $sort_tests, \@sort_styles ) ; +} +else { + test_driver( $sort_tests, \@sort_styles ) ; +} + +exit ; diff --git a/t/regex.t b/t/regex.t new file mode 100644 index 0000000..9b1ced2 --- /dev/null +++ b/t/regex.t @@ -0,0 +1,32 @@ +#!/usr/local/bin/perl -sw + +use strict ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +my @sort_styles = qw( plain orcish ST GRT ) ; + +my $sort_tests = [ + + { + skip => 0, + name => 'regex string', + gen => sub { rand_token() }, + gold => sub { ($a =~ /(\w+)/)[0] cmp ($b =~ /(\w+)/)[0] }, + args => [ qw( string /(\w+)/ ) ], + }, + { + skip => 0, + source => 0, + name => 'qr string', + gen => sub { rand_token() }, + gold => sub { ($a =~ /(\w+)/)[0] cmp ($b =~ /(\w+)/)[0] }, + args => [ string => qr/(\w+)/ ], + }, +] ; + +common_driver( $sort_tests, \@sort_styles ) ; + +exit ; diff --git a/t/simple.t b/t/simple.t new file mode 100644 index 0000000..9812433 --- /dev/null +++ b/t/simple.t @@ -0,0 +1,31 @@ +#!/usr/local/bin/perl -sw + +use strict ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +my @sort_styles = qw( plain orcish ST GRT ) ; + +my $sort_tests = [ + + { + skip => 0, + name => 'simple string', + data => [ qw( z e a k ) ], + gold => sub { $a cmp $b }, + args => [ qw( string ) ], + }, + { + skip => 0, + name => 'simple number', + data => [ 32, 2, 9, 7 ], + gold => sub { $a <=> $b }, + args => [ qw( number ) ], + }, +] ; + +common_driver( $sort_tests, \@sort_styles ) ; + +exit ; diff --git a/t/sort.t b/t/sort.t new file mode 100644 index 0000000..a2ff5d4 --- /dev/null +++ b/t/sort.t @@ -0,0 +1,88 @@ +#!/usr/local/bin/perl -w + +use strict ; + +use Test::More ; +use Carp ; + +my $ok ; + +use Sort::Maker qw( sorter_source ) ; + + +BEGIN{ + + $ok = use_ok( 'Sort::Maker' ) ; +print "OK [$ok] $@\n" ; + +} +#print "FOO\n" ; + +my @sort_tests = ( + + { + skip => 0, + name => 'simple string', + data => [ qw( z e a k ) ], + sort => sub { $a cmp $b }, + opts => [ qw( string ) ], + }, + { + skip => 0, + name => 'simple number', + data => [ 32, 2, 9, 7 ], + sort => sub { $a <=> $b }, + opts => [ qw( number ) ], + }, + { + skip => 0, + name => 'simple string ref_in', + data => [ qw( z e a k ) ], + sort => sub { $a cmp $b }, + opts => [ qw( ref_in string ) ], + ref_in => 1, + }, + { + skip => 0, + name => 'simple string ref_out', + data => [ qw( z e a k ) ], + sort => sub { $a cmp $b }, + opts => [ qw( ref_out string ) ], + ref_out => 1, + }, +) ; + +plan tests => @sort_tests * 2 ; + +foreach my $test ( @sort_tests ) { + + next if $test->{skip} ; + + my( $name, $data, $sort, $opts ) = @{$test}{qw( name data sort opts )} ; + + my @sorted = sort { $sort->() } @{$data} ; + +#print "gold SORTED [@sorted]\n" ; + + foreach my $style ( qw( plain ST ) ) { + + my $sorter = make_sorter( $style, @{$opts} ) ; + +print "SORT [$sorter] [$@]\n" ; + +print sorter_source( $sorter ) || '' ; + +die unless $sorter ; + + my @in = $test->{ref_in} ? $data : @{$data} ; + + my @test_sorted = $sorter->( @in ) ; + + @test_sorted = @{$test_sorted[0]} if $test->{ref_out} ; + +#print "style SORTED [@test_sorted]\n" ; + + ok( eq_array( \@sorted, \@test_sorted ), + "$style sort of $name" ) ; + } +} diff --git a/t/string_data.t b/t/string_data.t new file mode 100755 index 0000000..c2c3df6 --- /dev/null +++ b/t/string_data.t @@ -0,0 +1,104 @@ +#!/usr/local/bin/perl -sw + +use strict ; + +use lib 't' ; +use lib '..' ; +require 'common.pm' ; + +use vars '$bench' ; + +my @sort_styles = qw( GRT ) ; + +my @string_keys = map rand_alpha( 4, 8 ), 1 .. 100 ; +my @number_keys = map int( rand_number( 100, 10000 ) ), 1 .. 100 ; + +#print "STR @string_keys NUM @number_keys\n" ; + +my $sort_tests = [ + { + skip => 0, + name => 'simple string', + sizes => [100, 1000], + gen => sub { rand_choice( @string_keys ) }, + gold => sub { $a cmp $b }, + args => { + string => [ qw( string_data string ) ], + index => [ qw( string ) ], + } + }, + { + skip => 0, + name => 'simple string no-case', + sizes => [100, 1000], + gen => sub { rand_choice( @string_keys ) }, + gold => sub { uc($a) cmp uc($b) }, + args => { + string => [ qw( string_data string no_case ) ], + index => [ qw( string no_case ) ], + } + }, + { + skip => 0, + source => 0, + name => 'simple string descending', + sizes => [100, 1000], + gen => sub { rand_choice( @string_keys ) }, + gold => sub { $b cmp $a }, + args => { + string => [ qw( string_data string + descending varying ) ], + index => [ qw( string descending varying ) ], + } + }, + { + skip => 0, + name => 'simple string no-case descending', + sizes => [100, 1000], + gen => sub { rand_choice( @string_keys ) }, + gold => sub { uc($b) cmp uc($a) }, + args => { + string => [ qw( string_data string no_case + descending varying ) ], + index => [ qw( string no_case descending varying ) ], + } + }, + { + skip => 0, + name => 'simple number', + sizes => [100, 1000], + gen => sub { rand_choice( @number_keys ) }, + gold => sub { $a <=> $b }, + args => { + string => [ qw( string_data number ) ], + index => [ qw( number ) ], + } + }, + { + skip => 0, + source => 0, + sizes => [100, 1000], + name => 'string:number', + gen => sub { rand_choice( @string_keys ) . ':' . + rand_choice( @number_keys ) + }, + gold => sub { + ($a =~ /^(\w+)/)[0] cmp ($b =~ /^(\w+)/)[0] + || + ($a =~ /(\d+)$/)[0] <=> ($b =~ /(\d+)$/)[0] + }, + args => { + index => [ string => '/^(\w+)/', + number => '/(\d+)$/' + ], + string => [ 'string_data', + string => '/^(\w+)/', + number => '/(\d+)$/' + ], + }, + }, +] ; + +common_driver( $sort_tests, \@sort_styles ) ; + +exit ;