@InProceedings{agarwal96, author = {Agarwal, Ramesh C.}, title = {{A Super Scalar Sort Algorithm for RISC Processors}}, booktitle = {Proceedings of the 1996 ACM SIGMOD Conference}, year = {1996}, month = {June}, pages = {240-246}, keyword = {sorting, datamation, external, sequential}, abstract = {The compare and branch sequences required in a traditional sort algorithm can not efficiently exploit multiple execution units present in currently available high performance RISC processors. This is because of the long latency of the compare instructions and the sequential algorithm used in sorting. We have developed new sort algorithms which eliminate almost all the compares, provide functional parallelism which can be exploited by multiple execution units, significantly reduce the number of passes through keys, and improve data locality. These new algorithms outperform traditional sort algorithms by a large factor. We have implemented the Datamation sort benchmark using our new sort algorithm on a desktop IBM RS/6000 model 39H with IBM SSA 7133 disk drives.} } @InProceedings{arpacidusseau97:sort, author = {Arpaci-Dusseau, Andrea C. and Arpaci-Dusseau, Remzi H. and Culler, David E. and Hellerstein, Joseph M. and Patterson, David P.}, title = {{High-Performance Sorting on Networks of Workstations}}, booktitle = {Proceedings of the 1997 ACM SIGMOD Conference}, year = {1997}, pages = {243--254}, URL = {http://now.cs.berkeley.edu/NowSort/}, keyword = {sorting, parallel, network of workstations, cluster, external, datamation}, abstract = {We report the performance of NOW-Sort, a collection of sorting implementations on a Network of Workstations (NOW). We find that parallel sorting on NOWs is competitive to sorting on the large-scale SMPs that have traditionally held the performance records. On a 32-node cluster, we finish the Datamation benchmark in 2.41 seconds, and can sort 6.0 GB in just under one minute. On a smaller, better equipped, 8-node cluster, we run the Datamation in 2.92 seconds, and sort 1.4 GB in a minute. Our implementations can be applied to a variety of disk, memory, and processor configurations; we highlight salient issues for tuning each component of the system. Throughout the paper, we evaluate the use of commodity hardware and operating systems for parallel sorting, and note lessons that can be drawn when applying NOW technology to data-intensive applications.} } @InProceedings{batcher86, author = {Batcher, K.}, title = {{Sorting Networks and their Applications}}, booktitle = {Proceedings of the AFIPS Spring Joint Computing Conference}, year = {1986}, keyword = {sorting, parallel, internal} } @InProceedings{baugsto90, author = {Baugsto, B.A.W and Greipsland, J.F and Kamerbeek, J.}, title = {{Sorting Large Data Files on POMA}}, booktitle = {Proceedings of COMPAR-90 VAPPIV}, year = {1990}, month = {September}, pages = {536-547}, note = {Springer Verlag Lecture Notes No. 357}, keyword = {sorting, parallel, external}, abstract = {Reports on the results of the porting of a typical benchmark problem for distributed systems from its original platform the HC16 database machine to POOMA, the Parallel Object Oriented MAchine. The authors introduce the POOMA architecture and some of its rationale. They define the sorting problem and the specific algorithm. Next they deal with the C-version of the algorithm for POOMA and presents the results of a number of measurements. The POOL implementation and its results are covered. The paper ends with some conclusions.} } @TechReport{beck86, author = {Beck, Micah and Bitton, Dina and Wilkinson, W. Kevin}, title = {{Sorting Large Files on a Backend Multiprocessor}}, year = {1986}, month = {March}, number = {86-741}, institution = {Department of Computer Science, Cornell University}, keyword = {sorting, parallel, external}, abstract = {Sorting is one of the basic operations in any database system. In this paper the authors present two external sorting algorithms for hypercube database computers. The methods are based on partitioning of data according to partition values obtained through sampling of the data. One of the algorithms which is implemented at the HC16 database computer designed at The Norwegian Institute of Technology, is described in detail together with a performance evaluation and a presentation of some test results.} } @InProceedings{blelloch91, author = {Blelloch, G. and Leiserson, C. and Maggs, B.}, title = {{A Comparison of Sorting Algorithms for the Connection Machine CM-2}}, booktitle = {Proceedings of the ACM Symposium on Parallel Algorithms and Architectures}, year = {1991}, month = {July}, keyword = {sorting, parallel, internal}, abstract = {The authors have implemented three parallel sorting algorithms on the Connection Machine supercomputer model CM-2: Batcher's bitonic sort, a parallel radix sort, and a sample sort similar to Reif and Valiant's flashsort. They have also evaluated the implementation of many other sorting algorithms proposed in the literature. Their computational experiments show that the sample sort algorithm, which is a theoretically efficient 'randomized' algorithm is the fastest of the three algorithms on large data sets. On a 64 K-processor CM-2, their sample sort implementation can sort 32*10/sup 6/ 64-bit keys in 5.1 seconds, which is over 10 times faster than the CM-2 library sort. Their implementation of radix sort, although not as fast on large data sets, is deterministic, much simpler to code, faster with small keys, and faster on small data sets (few elements per processor). Their implementation of bitonic sort, which is pipelined to use all the hypercube wires simultaneously, is the least efficient of the three on large data sets, but is the most efficient on small data sets, and is considerably more space efficient. This paper analyzes the three algorithms in detail and discusses many practical issues that led the authors to the particular implementations.} } @InBook{culler94, author = {Culler, D. and Dusseau, A. and Martin, R. and Schauser, K.}, title = {{Portability and Performance for Parallel Processing}}, chapter = {4: Fast Parallel Sorting under LogP: from Theory to Practice}, year = {1994}, pages = {71-98}, publisher = {John Wiley \& Sons Ltd.}, keyword = {sorting, parallel, internal, parallel model, logp} } @Article{datamation85, author = {Anonymous}, title = {{A Measure of Transaction Processing Power}}, journal = {Datamation}, year = {1985}, volume = {31}, number = {7}, pages = {112-118}, note = {Also in {\em Readings in Database Systems}, M.H. Stonebraker ed., Morgan Kaufmann, San Mateo, 1989.}, keyword = {sorting, datamation, benchmark, workload}, abstract = {A measure of transaction processing power is needed-a standard means of quantifying and comparing the throughput and price/performance ratios of various transaction processing systems. This paper is an attempt by people active in transaction processing to record the folklore used to quantify transaction processing performance.} } @InProceedings{dewitt91, author = {Dewitt, D. and Naughton, J.F. and Schneider, D.A.}, title = {{Parallel Sorting on a Shared-Nothing Architecture using Probabilistic Splitting}}, booktitle = {Proceedings of the International Conference on Parallel and Distributed Information Systmes}, year = {1991}, keyword = {sorting, parallel, external}, abstract = {Comparing exact and probabilistic splitting for external sorting on a database. Model and experimental results from Gamma machine. Basically, the idea is to decide on a splitting vector, which defines $N$ buckets for an $N$-process program, and have each program read its initial segment of the data and send each element to the appropriate bucket (other process). All elements received are written to disks as small sorted runs. Then each process mergesorts its runs. Probabilistic split uses only a sample of the elements to define the vector.} } @Article{dusseau96:sort, author = {Dusseau, Andrea C. and Culler, David E. and Schauser, Klaus E. and Martin, Richard P.}, title = {{Fast Parallel Sorting Under LogP: Experience with the CM-5}}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {1996}, month = {August}, volume = {7}, number = {8}, pages = {791-805}, keyword = {sorting, parallel, internal, parallel model, logp}, abstract = {In this paper, we analyze four parallel sorting algorithms (bitonic, column, radix, and sample sort) with the LogP model. LogP characterizes the performance of modern parallel machines with a small set of parameters: the communication latency (L), overhead (o), bandwidth (g), and the number of processors (P). We develop implementations of these algorithms in Split-C, a parallel extension to C, and compare the performance predicted by LogP to actual performance on a CM-5 of 32 to 512 processors for a range of problem sizes. We evaluate the robustness of the algorithms by varying the distribution and ordering of the key values. We also briefly examine the sensitivity of the algorithms to the communication parameters. We show that the LogP model is a valuable guide in the development of parallel algorithms and a good predictor of implementation performance. The model encourages the use of data layouts which minimize communication and balanced communication schedules which avoid contention. With an empirical model of local processor performance, LogP predictions closely match observed execution times on uniformly distributed keys across a broad range of problem and machine sizes. We find that communication performance is oblivious to the distribution of the key values, whereas the local processor performance is not; some communication phases are sensitive to the ordering of keys due to contention. Finally, our analysis shows that overhead is the most critical communication parameter in the sorting algorithms.} } @TechReport{graefe90, author = {Graefe, Goetz}, title = {{Parallel External Sorting in Volcano}}, year = {1990}, month = {June}, number = {CU-CS-459}, institution = {Computer Science, University of Colorado at Boulder}, keyword = {sorting, parallel, external} } @InProceedings{hightower92, author = {Hightower, W. and Prins, J. and Reif, J.}, title = {{Implementations of Randomized Sorting on Large Parallel Machines}}, booktitle = {Symposium on Parallel Algorithms and Architectures}, year = {1992}, month = {June}, keyword = {sorting, parallel}, abstract = {Flashsort and Samplesort are related parallel sorting algorithms. Both utilize a sophisticated randomized sampling technique to form a splitter set, but Samplesort distributes the splitter set to each processor while Flashsort uses splitter-directed routing. In this paper we present B-Flashsort, a new hatched-routing variant of Flashsort designed to sort N>P values using P processors connected in a d-dimensional mesh and using constant space in addition to the input and output. The key advantage of the Flashsort approach over Samplesort is a decrease in memory requirements, by avoiding the broadcast of the splitter set to all processors. The practical advantage of B-Flashsort over Flashsort is that it replaces pipelined splitter-directed routing with a set of synchronous local communications and bounds recursion, while still being demonstrably efficient. The performance of B-Flashsort and Samplesort is compared using a parameterized analytic model to show that on a d-dimensional toroidal mesh B-Flashsort improves on Samplesort when (NIP)

M \log \min\{M/B, \log M\}/\log M$ and $\log M/B = o(\log M)$.} } @InProceedings{nyberg94, author = {Nyberg, Chris and Barclay, Tom and Cvetanovic, Zarka and Gray, Jim and Lomet, Dave}, title = {{AlphaSort: A RISC Machine Sort}}, booktitle = {Proceedings of 1994 ACM SIGMOD Conference}, year = {1994}, month = {May}, keyword = {sorting, parallel, external, datamation}, abstract = {A new sort algorithm, called AlphaSort, demonstrates that commodity processors and disks can handle commercial batch workloads. Using Alpha AXP processors, commodity memory, and arrays of SCSI disks, AlphaSort runs the industry-standard sort benchmark in seven seconds. This beats the best published record on a 32-cpu 32-disk hypercube by 8:1. On another benchmark, AlphaSort sorted more than a gigabyte in a minute. AlphaSort is a cache-sensitive memory-intensive sort algorithm. It uses file striping to get high disk bandwidth. It uses QuickSort to generate runs and uses replacement-selection to merge the runs. It uses shared memory multiprocessors to break the sort into subsort chores. Because startup times are becoming a significant part of the total time, we propose two new benchmarks: MinuteSort, how much can you sort in a minute, and DollarSort, how much can you sort for a dollar.} } @Article{reif87, author = {Reif, J.~H. and Valiant, L.~G.}, title = {{A Logarithmic time Sort for Linear Size Networks}}, journal = {Journal of the ACM}, year = {1987}, month = {January}, volume = {34}, number = {1}, pages = {60-76}, keyword = {sorting, parallel, internal}, abstract = {A randomized parallel algorithm that sorts on an N node network with constant valence in O(logN) time is given. More particularly, the algorithm sorts N items on an N-node cube-connected cycles graph, and, for some constant k, for all large enough alpha , it terminates within k alpha logN time with probability at least 1-N/sup - alpha /.} } @Article{salzberg90, author = {Salzberg, Betty and Tsukerman, Alex and Gray, Jim and Stewart, Michael and Uren, Susan and Vaughna, Bonnie}, title = {{FastSort; A Distributed Single-Input Single-Output External Sort}}, journal = {SIGMOD Record}, year = {1990}, month = {June}, volume = {19}, number = {2}, pages = {94-101}, keyword = {sorting, parallel, external, datamation}, abstract = {FastSort is an example of a parallel external sort. It executes on a loosely-coupled network of processors. Each processor, called a site, has its own main memory of up to 128 Megabytes, and its own disk drives, which are dedicated to sorting. The processors are connected by a local area network. FastSort assumes that the unsorted source file is at one site and sorts that file to another site. Partitions of the file may be shipped to intermediate sites for processing. The intermediate sites sort their parts in parallel. The paper outlines FastSort and classifies it using Graefe taxonomy. It then analyzes the method used to produce sorted runs, and looks in detail at the merging process. Experimental results are also given.} } @TechReport{thearling91, author = {Thearling, K. and Smith, S.}, title = {{An Improved Supercomputer Sorting Benchmark}}, year = {1991}, institution = {Thinking Machines Corporation}, keyword = {sorting, benchmark, internal} } @InProceedings{young95, author = {Young, Honesty and Swami, Arun}, title = {{The Parameterized Round-Robin Partitioned Algorithm for Parallel External Sort}}, booktitle = {Proceedings 9th International Parallel Processing Symposium}, year = {1995}, month = {April}, pages = {213-219}, address = {Santa Barbara, CA}, keyword = {sorting, parallel, external}, abstract = {In this paper, we present a new parameterized parallel sort algorithm, called Round-Robin Partitioned (or RRP), for the message passing (shared-nothing) architecture. This is a parameterized sort algorithm because a parameter is provided which can be used to determine the amount of memory used and to allocate differing amounts of work to different sets of sites. We utilize pipelining to hide disk I/O time, exploit high degrees of parallelism at all phases, apply sampling to determine the partition key values and use less memory than previous known methods while repairing the minimum number of physical I/Os. The basic version of the RRP algorithm is simple in terms of coding and complexity. It does not require disk I/O parallelism or data prefetch within a single process. We develop an analytical model for our algorithm and compare our sort algorithm with four other classes of external parallel sort algorithms. The RRP algorithm are shown to be superior to the other algorithms for almost all configurations.} } @InProceedings{zagha91, author = {Zagha, M. and Blelloch, G.}, title = {{Radix Sort for Vector Multiprocessors}}, booktitle = {Supercomputing}, year = {1991}, keyword = {sorting, internal, vector}, abstract = {The authors have designed a radix sort algorithm for vector multiprocessors and have implemented the algorithm on the CRAY-Y-MP. On one processor of the Y-MP the sort is over five times faster on large sorting problems than the optimized library sort provided by CRAY Research. On eight processors, an additional speedup of almost five is achieved, yielding a routine over 25 times faster than the library sort. Using the multiprocessor version, one can sort at a rate of 15 million 64-bit keys per second. This sorting algorithm is adapted from a data-parallel algorithm previously designed for the Connection Machine CM-2. To develop their version, the authors introduce three general techniques for mapping data-parallel algorithms onto vector multiprocessors. These techniques allow one to fully vectorize and parallelize the algorithm. The authors also derive equations that model the performance of the algorithm on the Y-MP. These equations are then used to optimize the radix size.} } @InProceedings{zhang96, author = {W. Zhang and P. Larson}, title = {{A Memory-Adaptive Sort (MASORT) for Database Systems}}, booktitle = {Proceedings of CASCON '96}, year = {1996}, month = {November}, address = {Toronto}, keyword = {sorting, parallel, external, databases} }