@InProceedings{agarwal96,
  author = {Agarwal, Ramesh C.},
  title = {{A Super Scalar Sort Algorithm for RISC Processors}},
  booktitle = {Proceedings of the 1996 ACM SIGMOD Conference},
  year = {1996},
  month = {June},
  pages = {240-246},
  keyword = {sorting, datamation, external, sequential},
  abstract = {The compare and branch sequences required in a traditional sort
  algorithm can not efficiently exploit multiple execution units present in
  currently available high performance RISC processors. This is because of the
  long latency of the compare instructions and the sequential algorithm used in
  sorting. We have developed new sort algorithms which eliminate almost all the
  compares, provide functional parallelism which can be exploited by multiple
  execution units, significantly reduce the number of passes through keys, and
  improve data locality. These new algorithms outperform traditional sort
  algorithms by a large factor. We have implemented the Datamation sort
  benchmark using our new sort algorithm on a desktop IBM RS/6000 model 39H
  with IBM SSA 7133 disk drives.}
}

@InProceedings{arpacidusseau97:sort,
  author = {Arpaci-Dusseau, Andrea C. and Arpaci-Dusseau, Remzi H. and Culler,
  David E. and Hellerstein, Joseph M. and Patterson, David P.},
  title = {{High-Performance Sorting on Networks of Workstations}},
  booktitle = {Proceedings of the 1997 ACM SIGMOD Conference},
  year = {1997},
  pages = {243--254},
  URL = {http://now.cs.berkeley.edu/NowSort/},
  keyword = {sorting, parallel, network of workstations, cluster, external,
  datamation},
  abstract = {We report the performance of NOW-Sort, a collection of sorting
  implementations on a Network of Workstations (NOW). We find that parallel
  sorting on NOWs is competitive to sorting on the large-scale SMPs that have
  traditionally held the performance records. On a 32-node cluster, we finish
  the Datamation benchmark in 2.41 seconds, and can sort 6.0 GB in just under
  one minute. On a smaller, better equipped, 8-node cluster, we run the
  Datamation in 2.92 seconds, and sort 1.4 GB in a minute. Our implementations
  can be applied to a variety of disk, memory, and processor configurations; we
  highlight salient issues for tuning each component of the system. Throughout
  the paper, we evaluate the use of commodity hardware and operating systems
  for parallel sorting, and note lessons that can be drawn when applying NOW
  technology to data-intensive applications.}
}

@InProceedings{batcher86,
  author = {Batcher, K.},
  title = {{Sorting Networks and their Applications}},
  booktitle = {Proceedings of the AFIPS Spring Joint Computing Conference},
  year = {1986},
  keyword = {sorting, parallel, internal}
}

@InProceedings{baugsto90,
  author = {Baugsto, B.A.W and Greipsland, J.F and Kamerbeek, J.},
  title = {{Sorting Large Data Files on POMA}},
  booktitle = {Proceedings of COMPAR-90 VAPPIV},
  year = {1990},
  month = {September},
  pages = {536-547},
  note = {Springer Verlag Lecture Notes No. 357},
  keyword = {sorting, parallel, external},
  abstract = {Reports on the results of the porting of a typical benchmark
  problem for distributed systems from its original platform the HC16 database
  machine to POOMA, the Parallel Object Oriented MAchine. The authors introduce
  the POOMA architecture and some of its rationale. They define the sorting
  problem and the specific algorithm. Next they deal with the C-version of the
  algorithm for POOMA and presents the results of a number of measurements. The
  POOL implementation and its results are covered. The paper ends with some
  conclusions.}
}

@TechReport{beck86,
  author = {Beck, Micah and Bitton, Dina and Wilkinson, W. Kevin},
  title = {{Sorting Large Files on a Backend Multiprocessor}},
  year = {1986},
  month = {March},
  number = {86-741},
  institution = {Department of Computer Science, Cornell University},
  keyword = {sorting, parallel, external},
  abstract = {Sorting is one of the basic operations in any database system. In
  this paper the authors present two external sorting algorithms for hypercube
  database computers. The methods are based on partitioning of data according
  to partition values obtained through sampling of the data. One of the
  algorithms which is implemented at the HC16 database computer designed at The
  Norwegian Institute of Technology, is described in detail together with a
  performance evaluation and a presentation of some test results.}
}

@InProceedings{blelloch91,
  author = {Blelloch, G. and Leiserson, C. and Maggs, B.},
  title = {{A Comparison of Sorting Algorithms for the Connection Machine
  CM-2}},
  booktitle = {Proceedings of the ACM Symposium on Parallel Algorithms and
  Architectures},
  year = {1991},
  month = {July},
  keyword = {sorting, parallel, internal},
  abstract = {The authors have implemented three parallel sorting algorithms on
  the Connection Machine supercomputer model CM-2: Batcher's bitonic sort, a
  parallel radix sort, and a sample sort similar to Reif and Valiant's
  flashsort. They have also evaluated the implementation of many other sorting
  algorithms proposed in the literature. Their computational experiments show
  that the sample sort algorithm, which is a theoretically efficient
  'randomized' algorithm is the fastest of the three algorithms on large data
  sets. On a 64 K-processor CM-2, their sample sort implementation can sort
  32*10/sup 6/ 64-bit keys in 5.1 seconds, which is over 10 times faster than
  the CM-2 library sort. Their implementation of radix sort, although not as
  fast on large data sets, is deterministic, much simpler to code, faster with
  small keys, and faster on small data sets (few elements per processor). Their
  implementation of bitonic sort, which is pipelined to use all the hypercube
  wires simultaneously, is the least efficient of the three on large data sets,
  but is the most efficient on small data sets, and is considerably more space
  efficient. This paper analyzes the three algorithms in detail and discusses
  many practical issues that led the authors to the particular
  implementations.}
}

@InBook{culler94,
  author = {Culler, D. and Dusseau, A. and Martin, R. and Schauser, K.},
  title = {{Portability and Performance for Parallel Processing}},
  chapter = {4: Fast Parallel Sorting under LogP: from Theory to Practice},
  year = {1994},
  pages = {71-98},
  publisher = {John Wiley \& Sons Ltd.},
  keyword = {sorting, parallel, internal, parallel model, logp}
}

@Article{datamation85,
  author = {Anonymous},
  title = {{A Measure of Transaction Processing Power}},
  journal = {Datamation},
  year = {1985},
  volume = {31},
  number = {7},
  pages = {112-118},
  note = {Also in {\em Readings in Database Systems}, M.H. Stonebraker ed.,
  Morgan Kaufmann, San Mateo, 1989.},
  keyword = {sorting, datamation, benchmark, workload},
  abstract = {A measure of transaction processing power is needed-a standard
  means of quantifying and comparing the throughput and price/performance
  ratios of various transaction processing systems. This paper is an attempt by
  people active in transaction processing to record the folklore used to
  quantify transaction processing performance.}
}

@InProceedings{dewitt91,
  author = {Dewitt, D. and Naughton, J.F. and Schneider, D.A.},
  title = {{Parallel Sorting on a Shared-Nothing Architecture using
  Probabilistic Splitting}},
  booktitle = {Proceedings of the International Conference on Parallel and
  Distributed Information Systmes},
  year = {1991},
  keyword = {sorting, parallel, external},
  abstract = {Comparing exact and probabilistic splitting for external sorting
  on a database. Model and experimental results from Gamma machine. Basically,
  the idea is to decide on a splitting vector, which defines $N$ buckets for an
  $N$-process program, and have each program read its initial segment of the
  data and send each element to the appropriate bucket (other process). All
  elements received are written to disks as small sorted runs. Then each
  process mergesorts its runs. Probabilistic split uses only a sample of the
  elements to define the vector.}
}

@Article{dusseau96:sort,
  author = {Dusseau, Andrea C. and Culler, David E. and Schauser, Klaus E. and
  Martin, Richard P.},
  title = {{Fast Parallel Sorting Under LogP: Experience with the CM-5}},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  year = {1996},
  month = {August},
  volume = {7},
  number = {8},
  pages = {791-805},
  keyword = {sorting, parallel, internal, parallel model, logp},
  abstract = {In this paper, we analyze four parallel sorting algorithms
  (bitonic, column, radix, and sample sort) with the LogP model. LogP
  characterizes the performance of modern parallel machines with a small set of
  parameters: the communication latency (L), overhead (o), bandwidth (g), and
  the number of processors (P). We develop implementations of these algorithms
  in Split-C, a parallel extension to C, and compare the performance predicted
  by LogP to actual performance on a CM-5 of 32 to 512 processors for a range
  of problem sizes. We evaluate the robustness of the algorithms by varying the
  distribution and ordering of the key values. We also briefly examine the
  sensitivity of the algorithms to the communication parameters. We show that
  the LogP model is a valuable guide in the development of parallel algorithms
  and a good predictor of implementation performance. The model encourages the
  use of data layouts which minimize communication and balanced communication
  schedules which avoid contention. With an empirical model of local processor
  performance, LogP predictions closely match observed execution times on
  uniformly distributed keys across a broad range of problem and machine sizes.
  We find that communication performance is oblivious to the distribution of
  the key values, whereas the local processor performance is not; some
  communication phases are sensitive to the ordering of keys due to contention.
  Finally, our analysis shows that overhead is the most critical communication
  parameter in the sorting algorithms.}
}


@TechReport{graefe90,
  author = {Graefe, Goetz},
  title = {{Parallel External Sorting in Volcano}},
  year = {1990},
  month = {June},
  number = {CU-CS-459},
  institution = {Computer Science, University of Colorado at Boulder},
  keyword = {sorting, parallel, external}
}

@InProceedings{hightower92,
  author = {Hightower, W. and Prins, J. and Reif, J.},
  title = {{Implementations of Randomized Sorting on Large Parallel Machines}},
  booktitle = {Symposium on Parallel Algorithms and Architectures},
  year = {1992},
  month = {June},
  keyword = {sorting, parallel},
  abstract = {Flashsort and Samplesort are related parallel sorting algorithms.
  Both utilize a sophisticated randomized sampling technique to form a splitter
  set, but Samplesort distributes the splitter set to each processor while
  Flashsort uses splitter-directed routing. In this paper we present
  B-Flashsort, a new hatched-routing variant of Flashsort designed to sort N>P
  values using P processors connected in a d-dimensional mesh and using
  constant space in addition to the input and output. The key advantage of the
  Flashsort approach over Samplesort is a decrease in memory requirements, by
  avoiding the broadcast of the splitter set to all processors. The practical
  advantage of B-Flashsort over Flashsort is that it replaces pipelined
  splitter-directed routing with a set of synchronous local communications and
  bounds recursion, while still being demonstrably efficient. The performance
  of B-Flashsort and Samplesort is compared using a parameterized analytic
  model to show that on a d-dimensional toroidal mesh B-Flashsort improves on
  Samplesort when (NIP)<P/(c/sub 1/log P +c/sub 2/dP/sup 1/d/+c/sub 3/), for
  machine-dependent parameters c/sub 1/, c/sub 2/, and c/sub 3/. Empirical
  confirmation of the analytical model is obtained through implementations on a
  MasPar MP-1 of Samplesort and two B-Flashsort variants.}
}

@Article{hoare62,
  author = {C. A. R. Hoare},
  title = {{Quicksort}},
  journal = {Computer Journal},
  year = {1962},
  volume = {5},
  number = {1},
  pages = {10-15},
  keyword = {sorting, internal}
}

@Book{knuth,
  title = {Sorting and Searching},
  author = {Knuth, Donald E.},
  year = {1973},
  publisher = {Addison-Wesley},
  keyword = {sorting, theory, external, internal}
}

@Article{leighton85,
  author = {Leighton, T.},
  title = {{Tight Bounds on the Complexity of Parallel Sorting}},
  journal = {IEEE Transactions on Computers},
  year = {1985},
  month = {April},
  keyword = {sorting, parallel, internal, theory, column sort},
  abstract = {The author proves tight upper and lower bounds on the number of
  processors and on the information transfer, wire area, and time needed to
  sort N numbers in a bounded-degree fixed-connection network. The most
  important new results are: (1) the construction of an N-node degree-3 network
  capable of sorting N numbers in O(log N) word steps; (2) a proof that any
  network capable of sorting N(7 log N)-bit numbers in T bit steps requires
  area A where AT/sup 2/= Omega (N/sup 2/ log/sup 2/ N); and (3) the
  construction of a 'small-constant-factor' bounded-degree network that sorts N
  Theta (log N)-bit numbers in T= Theta (log N) bit steps with A= Theta (N/sup
  2/) area.}
}

@InProceedings{li93,
  author = {Li, Xiqing and Linoff, Gordon and Smith, Stephen and Stanfill,
  Craig and Thearling, Kurt},
  title = {{A Practical External Sort for Shared Disk MPPs}},
  booktitle = {Proceedings of SUPERCOMPUTING '93},
  year = {1993},
  month = {November},
  pages = {666-675},
  keyword = {sorting, parallel, external},
  abstract = {An external sort has been implemented and analyzed for a shared
  disk MPP computer system. In this implementation, we have considered many
  real world constraints. Decision support functionality in database systems,
  for instance, often requires that external sorting be done in place on disk,
  support variable length records, and be restartable from any point of
  interruption with no loss of data. These three constraints, along with the
  more standard requirements of speed and stability, affect the choice and
  implementation of the external sorting algorithm. The implementation of the
  sample sort algorithm described here meets these requirements. Although
  written using high level file processing directives, the implementation sorts
  a 10 GB file in 1.5 h on a 64 processor Connection Machine CM-5 with a
  DataVault disk system.}
}

@TechReport{nodine92,
  author = {Nodine, Mark H. and Vitter, Jeffrey Scott},
  title = {{Optimal Deterministic Sorting in Parallel Memory Hierarchies}},
  year = {1992},
  month = {August},
  number = {CS-92-38},
  type = {Computer Science},
  institution = {Brown University},
  address = {Providence, Rhode Island},
  URL =
  {http://www.cs.brown.edu/publications/techreports/reports/CS-92-38.html},
  keyword = {sorting, parallel, external},
  abstract = {We present a general deterministic sorting strategy that is
  applicable to a wide variety of parallel memory hierarchies with parallel
  processors. The simplest incarnation of the strategy is an optimal
  deterministic algorithm called Balance Sort for external sorting on multiple
  disks with a single CPU. Balance Sort was the topic of a previous technical
  report. This technical report shows how to adapt Balance Sort to sort
  deterministically in parallel memory hierarchies. The algorithms so derived
  will be optimal for all parallel memory hierarchies for which an optimal
  algorithm is known for a single hierarchy. In the case of $D$ disks, $P$
  processors, block size $B$, and internal memory size $M$, they are optimal in
  terms of I/Os for any $P \leq M$ and $DB < M$, and in terms of internal
  processing time except when $P > M \log \min\{M/B, \log M\}/\log M$ and $\log
  M/B = o(\log M)$.}
}

@InProceedings{nyberg94,
  author = {Nyberg, Chris and Barclay, Tom and Cvetanovic, Zarka and Gray, Jim
  and Lomet, Dave},
  title = {{AlphaSort: A RISC Machine Sort}},
  booktitle = {Proceedings of 1994 ACM SIGMOD Conference},
  year = {1994},
  month = {May},
  keyword = {sorting, parallel, external, datamation},
  abstract = {A new sort algorithm, called AlphaSort, demonstrates that
  commodity processors and disks can handle commercial batch workloads. Using
  Alpha AXP processors, commodity memory, and arrays of SCSI disks, AlphaSort
  runs the industry-standard sort benchmark in seven seconds. This beats the
  best published record on a 32-cpu 32-disk hypercube by 8:1. On another
  benchmark, AlphaSort sorted more than a gigabyte in a minute. AlphaSort is a
  cache-sensitive memory-intensive sort algorithm. It uses file striping to get
  high disk bandwidth. It uses QuickSort to generate runs and uses
  replacement-selection to merge the runs. It uses shared memory
  multiprocessors to break the sort into subsort chores. Because startup times
  are becoming a significant part of the total time, we propose two new
  benchmarks: MinuteSort, how much can you sort in a minute, and DollarSort,
  how much can you sort for a dollar.}
}

@Article{reif87,
  author = {Reif, J.~H. and Valiant, L.~G.},
  title = {{A Logarithmic time Sort for Linear Size Networks}},
  journal = {Journal of the ACM},
  year = {1987},
  month = {January},
  volume = {34},
  number = {1},
  pages = {60-76},
  keyword = {sorting, parallel, internal},
  abstract = {A randomized parallel algorithm that sorts on an N node network
  with constant valence in O(logN) time is given. More particularly, the
  algorithm sorts N items on an N-node cube-connected cycles graph, and, for
  some constant k, for all large enough alpha , it terminates within k alpha
  logN time with probability at least 1-N/sup - alpha /.}
}

@Article{salzberg90,
  author = {Salzberg, Betty and Tsukerman, Alex and Gray, Jim and Stewart,
  Michael and Uren, Susan and Vaughna, Bonnie},
  title = {{FastSort; A Distributed Single-Input Single-Output External Sort}},
  journal = {SIGMOD Record},
  year = {1990},
  month = {June},
  volume = {19},
  number = {2},
  pages = {94-101},
  keyword = {sorting, parallel, external, datamation},
  abstract = {FastSort is an example of a parallel external sort. It executes
  on a loosely-coupled network of processors. Each processor, called a site,
  has its own main memory of up to 128 Megabytes, and its own disk drives,
  which are dedicated to sorting. The processors are connected by a local area
  network. FastSort assumes that the unsorted source file is at one site and
  sorts that file to another site. Partitions of the file may be shipped to
  intermediate sites for processing. The intermediate sites sort their parts in
  parallel. The paper outlines FastSort and classifies it using Graefe
  taxonomy. It then analyzes the method used to produce sorted runs, and looks
  in detail at the merging process. Experimental results are also given.}
}

@TechReport{thearling91,
  author = {Thearling, K. and Smith, S.},
  title = {{An Improved Supercomputer Sorting Benchmark}},
  year = {1991},
  institution = {Thinking Machines Corporation},
  keyword = {sorting, benchmark, internal}
}

@InProceedings{young95,
  author = {Young, Honesty and Swami, Arun},
  title = {{The Parameterized Round-Robin Partitioned Algorithm for Parallel
  External Sort}},
  booktitle = {Proceedings 9th International Parallel Processing Symposium},
  year = {1995},
  month = {April},
  pages = {213-219},
  address = {Santa Barbara, CA},
  keyword = {sorting, parallel, external},
  abstract = {In this paper, we present a new parameterized parallel sort
  algorithm, called Round-Robin Partitioned (or RRP), for the message passing
  (shared-nothing) architecture. This is a parameterized sort algorithm because
  a parameter is provided which can be used to determine the amount of memory
  used and to allocate differing amounts of work to different sets of sites. We
  utilize pipelining to hide disk I/O time, exploit high degrees of parallelism
  at all phases, apply sampling to determine the partition key values and use
  less memory than previous known methods while repairing the minimum number of
  physical I/Os. The basic version of the RRP algorithm is simple in terms of
  coding and complexity. It does not require disk I/O parallelism or data
  prefetch within a single process. We develop an analytical model for our
  algorithm and compare our sort algorithm with four other classes of external
  parallel sort algorithms. The RRP algorithm are shown to be superior to the
  other algorithms for almost all configurations.}
}

@InProceedings{zagha91,
  author = {Zagha, M. and Blelloch, G.},
  title = {{Radix Sort for Vector Multiprocessors}},
  booktitle = {Supercomputing},
  year = {1991},
  keyword = {sorting, internal, vector},
  abstract = {The authors have designed a radix sort algorithm for vector
  multiprocessors and have implemented the algorithm on the CRAY-Y-MP. On one
  processor of the Y-MP the sort is over five times faster on large sorting
  problems than the optimized library sort provided by CRAY Research. On eight
  processors, an additional speedup of almost five is achieved, yielding a
  routine over 25 times faster than the library sort. Using the multiprocessor
  version, one can sort at a rate of 15 million 64-bit keys per second. This
  sorting algorithm is adapted from a data-parallel algorithm previously
  designed for the Connection Machine CM-2. To develop their version, the
  authors introduce three general techniques for mapping data-parallel
  algorithms onto vector multiprocessors. These techniques allow one to fully
  vectorize and parallelize the algorithm. The authors also derive equations
  that model the performance of the algorithm on the Y-MP. These equations are
  then used to optimize the radix size.}
}

@InProceedings{zhang96,
  author = {W. Zhang and P. Larson},
  title = {{A Memory-Adaptive Sort (MASORT) for Database Systems}},
  booktitle = {Proceedings of CASCON '96},
  year = {1996},
  month = {November},
  address = {Toronto},
  keyword = {sorting, parallel, external, databases}
}


