% BibTeX bibliography file

@InProceedings{acharya97,
  author = {Acharya, Anurag and Edjlali, Guy and Saltz, Joel},
  title = {{The Utility of Exploiting Idle Workstations for Parallel
  Computation}},
  booktitle = {Proceedings of 1997 ACM Sigmetrics International Conference on
  Measurement and Modeling of Computer Systems},
  year = {1997},
  month = {June},
  address = {Seattle},
  keyword = {network of workstations, cluster, scheduling, idle, parallel}
}

@InProceedings{adve93,
  author = {Adve, Vikram~S. and Vernon, Mary~K.},
  title = {{The Influence of Random Delays on Parallel Execution Times}},
  booktitle = {Proceedings of the 1993 ACM SIGMETRICS Conference on Measurement
  and Modeling of Computer Systems},
  year = {1993},
  month = {May},
  pages = {61-73},
  keyword = {scheduling, parallel},
  abstract = {Stochastic models are widely used for the performance evaluation
  of parallel programs and systems. The stochastic assumptions in such models
  are intended to represent non-deterministic processing requirements as well
  as random delays due to inter-process communication and resource contention.
  The authors provide compelling analytical and experimental evidence that in
  current and foreseeable shared-memory programs, communication delays
  introduce negligible variance into the execution time between synchronization
  points. Furthermore, they show using direct measurements of variance that
  other sources of randomness, particularly non-deterministic computational
  requirements, also do not introduce significant variance in many programs.
  They then use two examples to demonstrate the implications of these results
  for parallel program performance prediction models, as well as for general
  stochastic models of parallel systems.}
}

@InProceedings{agrawal85,
  author = {Agrawal, R. and Ezzat, A.K.},
  title = {{Processor Sharing in Nest: A Network of Computer Workstations}},
  booktitle = {Proceedings of 1st International Conference on Computer
  Workstations},
  year = {1985},
  month = {November},
  keyword = {network of workstations, cluster, scheduling},
  abstract = {NEST is a research project aimed at creating a computing
  environment that consists of a network of highly autonomous, yet cooperating
  personal computer workstations and shared servers. An important aspect of
  this project is to provide processor sharing by creating a pool of compute
  servers in the network that may be used by the workstations to supplement
  their computing needs. Some processors are permanently designated to be the
  computer servers. In addition, through an advertisement mechanism, any
  workstation may make itself temporarily available for a specific duration of
  time to be used as a compute server. The design and implementation of the
  scheme for augmenting the UNIX system with the execution-location-independent
  remote execution capability is presented. This capability allows processes to
  be offloaded to the compute servers and preserves the execution environment
  of these processes as if they were still executing locally at the originating
  machine.}
}

@InProceedings{anderson92,
  author = {Thomas Anderson and Brian Bershad and Edward Lazowska and Henry
  Levy},
  title = {{Scheduler Activations: Effective Kernel Support for the User-Level
  Management of Parallelism}},
  booktitle = {ACM Transactions on Computer Systems},
  year = {1992},
  month = {February},
  pages = {53-79},
  keyword = {operating system, scheduling},
  abstract = {Threads are the vehicle for concurrency in many approaches to
  parallel programming. Threads can be supported either by the operating system
  kernel or by user-level library code in the application address space, but
  neither approach has been fully satisfactory. The paper addresses this
  dilemma. First, the authors argue that the performance of kernel threads is
  inherently worse than that of user-level threads, rather than this being an
  artifact of existing implementations; managing parallelism at the user level
  is essential to high-performance parallel computing. Next, they argue that
  the problems encountered in integrating user-level threads with other system
  services is a consequence of the lack of kernel support for user-level
  threads provided by contemporary multiprocessor operating systems; kernel
  threads are the wrong abstraction on which to support user-level management
  of parallelism. Finally, they describe the design, implementation, and
  performance of a new kernel interface and user-level thread package that
  together provide the same functionality as kernel threads without
  compromising the performance and flexibility advantages of user-level
  management of parallelism.}
}

@InProceedings{arpaci95:interact,
  author = {Arpaci, Remzi H. and Dusseau, Andrea C. and Vahdat, Amin M. and
  Liu, Lok T. and Anderson, Thomas E. and Patterson, David A.},
  title = {{The Interaction of Parallel and Sequential Workloads on a Network
  of Workstations}},
  booktitle = {Proceedings of ACM SIGMETRICS'95/PERFORMANCE'95 Joint
  International Conference on Measurement and Modeling of Computer Systems},
  year = {1995},
  month = {May},
  pages = {267-278},
  keyword = {network of workstations, cluster, parallel, scheduling},
  abstract = {This paper examines the plausibility of using a network of
  workstations (NOW) for a mixture of parallel and sequential jobs. Through
  simulations, our study examines issues that arise when combining these two
  workloads on a single platform. Starting from a dedicated NOW just for
  parallel programs, we incrementally relax uniprogramming restrictions until
  we have a multi-programmed, multi-user NOW for both interactive sequential
  users and parallel programs. We show that a number of issues associated with
  the distributed NOW environment (e.g., daemon activity, coscheduling skew)
  can have a small but noticeable effect on parallel program performance. We
  also find that efficient migration to idle workstations is necessary to
  maintain acceptable parallel application performance. Furthermore, we present
  a methodology for deriving an optimal delay time for recruiting idle machines
  for use by parallel programs; this recruitment threshold was just 3 minutes
  for the research cluster we measured. Finally, we quantify the effects of the
  additional parallel load upon interactive users by keeping track of the
  potential number of user delays in our simulations. When we limit the maximum
  number of delays per user, we can still maintain acceptable parallel program
  performance. In summary, we find that for our workloads a 2:1 rule applies: a
  NOW cluster of approximately 60 machines can sustain a 32-node parallel
  workload in addition to the sequential load placed upon it by interactive
  users.}
}

@InProceedings{ashok92,
  author = {Immaneni Ashok and John Zahorjan},
  title = {{Scheduling a Mixed Interactive and Batch Workload on a Parallel,
  Shared-Memory Supercomputer}},
  booktitle = {Proceedings of Supercomputing '92},
  year = {1992},
  month = {November},
  pages = {616--625},
  keyword = {scheduling, parallel},
  abstract = {The authors analyze three approaches to scheduling mixed batch
  and interactive workloads on a supercomputer: (i) fixed partition, in which
  memory resources are statically allocated between the workloads: (ii) no
  partition, in which the interactive workload preempts resources as needed
  from the batch workload, and (iii) no partition with grouped admission, in
  which the interactive workload preempts resources only when the number of
  waiting interactive jobs reaches a threshold value. The authors also
  investigate the potential benefits of using virtual memory to perform the
  automatic overlay of jobs too large to fit in the amount of real memory
  instantaneously available to them. Using analytic tools, they compare the
  different policies according to the average speedup achieved by the batch
  workload given that a mean interactive job response time objective must be
  met by each. They show that, under a wide variety of conditions, fixed
  partition performs better than the other policies.}
}

@Article{atallah92,
  author = {Atallah, Mikhail J. and Black, Christina Lock and Marinescu, Dan C.
  and Siegel, Howard Jay and Casavant, Thomas L.},
  title = {{Models and Algorithms for Co-scheduling Compute-Intensive Tasks on
  a Network of Workstations}},
  journal = {Journal of Parallel and Distrbuted Computing},
  year = {1992},
  volume = {16},
  pages = {319-327},
  keyword = {scheduling, parallel, network of workstations, cluster},
  abstract = {The problem of using the idle cycles of a number of high
  performance workstations, interconnected by a high speed network, for solving
  computationally intensive tasks is discussed. The classes of distributed
  applications examined require some form of synchronization among the
  subtasks, hence the need for coscheduling to guarantee that subtasks start at
  the same time and execute at the same pace on a group of workstations. A
  model of the system is presented that allows the definition of an objective
  function to be maximized. Then a quadratic time and linear space algorithm is
  derived for computing the optimal coschedule, for the given model and class
  of applications addressed.}
}

@Article{bernstein71,
  author = {Bernstein, A.J. and Sharp, J.C.},
  title = {{A Policy-Driven Scheduler for a Time Sharing System}},
  journal = {Communications of the ACM},
  year = {1971},
  month = {February},
  volume = {14},
  number = {2},
  pages = {74-78},
  keyword = {scheduling, time share, fair},
  abstract = {The services received by a process from a time-sharing operating
  system can be characterized by a resource count Sigma w/sub i/R/sub ij/ where
  R/sub ij/ is the number of units of service received by process j from
  resource i and w/sub i/ is the cost per unit of the service. Each class of
  users can be characterized by a policy function which specifies the amount of
  service a user who belongs to this class should receive as a function of
  time. Priority changes dynamically as a function of the difference between
  the service promised to the user by the policy function and the service user
  actually receives. A scheduling and swapping algorithm which keeps the
  resource count of each process above its policy function will provide the
  specified level of service. Overhead can be reduced by avoiding swaps of
  processes which have received at least this level of service. The algorithm
  has been implemented in a general purpose operating system, and it has
  provided significantly better service to interactive and to batch jobs than
  the previous scheduler.}
}

@InProceedings{braams95,
  author = {Bramms, Jan},
  title = {{Batch Class process scheduler for Unix SVR4}},
  booktitle = {Proceedings of Sigmetrics '95/ Performance '95},
  year = {1995},
  month = {May},
  pages = {301-302},
  note = {Extended Abstract},
  keyword = {scheduling, svr4},
  abstract = {This paper introduces the BC (Batch Class) process scheduler. It
  runs batch processes in their own class, with lower priority than the
  interactive processes. It schedules batch processes, on the basis of the
  available amounts of resources that are left over from processes in other
  classes, rather than its own resource usage. The paper also describes an
  actual implementation of the BC class and an example of the effects of its
  use.}
}

@InProceedings{chandra94,
  author = {Rohit Chandra and Scott Devine and Ben Verghese and Anoop Gupta and
  Mendel Rosenblum},
  title = {{Scheduling and Page Migration for Multiprocessor Computer
  Servers}},
  booktitle = {Proceedings of the 6th International Conference on Architectural
  Support for Programming Languages and Operating Systems},
  year = {1994},
  month = {October},
  pages = {12--24},
  keyword = {parallel, scheduling, migration, shared memory},
  abstract = {Several cache coherent shared memory multiprocessors have been
  developed that are scalable and offer a very tight coupling between the
  processing resources. They are therefore quite attractive for use as compute
  servers for multiprogramming and parallel application workloads. Process
  scheduling and memory management, however, remain challenging due to the
  distributed main memory found on such machines. The paper examines the
  effects of OS scheduling and page migration policies on the performance of
  such compute servers. Our experiments are done on the Stanford DASH, a
  distributed memory cache coherent multiprocessor. We show that for our
  multiprogramming workloads consisting of sequential jobs, the traditional
  Unix scheduling policy does very poorly. In contrast, a policy incorporating
  cluster and cache affinity along with a simple page migration algorithm
  offers up to twofold performance improvement. For our workloads consisting of
  multiple parallel applications, we compare space sharing policies that divide
  the processors among the applications to time slicing policies such as
  standard Unix or gang scheduling. We show that space sharing policies can
  achieve better processor utilization due to the operating point effect, but
  time slicing policies benefit strongly from user level data distribution. Our
  initial experience with automatic page migration suggests that policies based
  only on TLB miss information can be quite effective, and useful for
  addressing the data distribution problems of space sharing schedulers.}
}

@InProceedings{chiang94,
  author = {Su-Hui Chiang and Rajesh K. Mansharamani and Mary K. Vernon},
  title = {{Use of Application Characteristics and Limited Preemption for
  Run-To-Completion Parallel Processor Scheduling Policies}},
  booktitle = {Proceedings of the 1994 ACM SIGMETRICS Conference},
  year = {1994},
  month = {February},
  pages = {33--44},
  keyword = {parallel, scheduling},
  abstract = {The performance potential of run-to-completion (RTC) parallel
  processor scheduling policies is investigated by examining whether (1)
  application execution rate characteristics such as average parallelism (avg)
  and processor working set (pws) and/or (2) limited preemption can be used to
  improve the performance of these policies. We address the first question by
  comparing policies (previous as well as new) that differ only in whether or
  not they use execution rate characteristics and by examining a wider range of
  the workload parameter space than previous studies. We address the second
  question by comparing a simple two-level queueing policy with RTC scheduling
  in the second level queue against RTC policies that don't allow any
  preemption and against dynamic equiallocation (EQ).}
}

@TechReport{crovella91,
  author = {Crovella, Mark and Das, Prakash and Dubnicki, Czarek and LeBlanc,
  Thomas and Markatos, Evangelos},
  title = {{Multiprogramming on Multiprocessors}},
  year = {1991},
  month = {February},
  number = {385},
  institution = {University of Rochester, Computer Science Department},
  keyword = {parallel, scheduling},
  abstract = {Many solutions have been proposed to the problem of
  multiprogramming a multiprocessor. However, each has limited applicability or
  fails to address an important source of overhead. In addition, there has been
  little experimental comparison of the various solutions in the presence of
  applications with varying degrees of parallelism and synchronization. The
  authors explore the tradeoffs between three different approaches to
  multiprogramming a multiprocessor: time-slicing, coscheduling, and dynamic
  hardware partitions. They implemented applications that vary in the degree of
  parallelism, and the frequency and type of synchronization. They show that in
  most cases coscheduling is preferable to time-slicing. They also show that
  although there are cases where coscheduling is beneficial, dynamic hardware
  partitions do no worse, and will often do better. They conclude that under
  most circumstances, hardware partitioning is the best strategy for
  multiprogramming a multiprocessor, no matter how much parallelism
  applications employ or how frequently synchronization occurs.}
}

@TechReport{dusseau95,
  author = {Dusseau, Andrea C. and Arpaci, Remzi H. and Culler, David E.},
  title = {{Re-examining Scheduling and Communication in Parallel Programs}},
  year = {1994},
  month = {December},
  number = {UCB//CSD-95-881},
  type = {Computer Science},
  institution = {University of California, Berkley},
  keyword = {scheduling, communication, parallel, network of workstations,
  cluster}
}

@InProceedings{dusseau96:implicit,
  author = {Dusseau, Andrea C. and Arpaci, Remzi H. and Culler, David E.},
  title = {{Effective Distributed Scheduling of Parallel Workloads}},
  booktitle = {Proceedings of 1996 ACM Sigmetrics International Conference on
  Measurement and Modeling of Computer Systems},
  year = {1996},
  keyword = {scheduling, parallel, network of workstations, cluster,
  coscheduling},
  abstract = {We present a distributed algorithm for time-sharing parallel
  workloads that is competitive with coscheduling. Implicit scheduling allows
  each local scheduler in the system to make independent decisions that
  dynamically coordinate the scheduling of cooperating processes across
  processors. Of particular importance is the blocking algorithm which decides
  the action of a process waiting for a communication or synchronization event
  to complete. Through simulation of bulk-synchronous parallel applications, we
  find that a simple two-phase fixed-spin blocking algorithm performs well; a
  two-phase adaptive algorithm that gathers run-time data on barrier wait-times
  performs slightly better. Our results hold for a range of machine parameters
  and parallel program characteristics. These findings are in direct contrast
  to the literature that states explicit coscheduling is necessary for
  fine-grained programs. We show that the choice of the local scheduler is
  crucial, with a priority-based scheduler performing two to three times better
  than a round-robin scheduler. Overall, we find that the performance of
  implicit scheduling is near that of coscheduling (+/-35%), without the
  requirement of explicit, global coordination.}
}

@Article{eager86,
  author = {Derek L. Eager and Edward D. Lazowska and John Zahorjan},
  title = {{Adaptive Load Sharing in Homogeneous Distributed Systems}},
  journal = {IEEE Transactions on Software Engineering},
  year = {1986},
  month = {May},
  volume = {12},
  number = {5},
  pages = {662--675},
  keyword = {load balance, parallel, distributed, scheduling},
  abstract = {Rather than proposing a specific load sharing policy for
  implementation, the authors address the more fundamental question of the
  appropriate level of complexity for load sharing policies. It is shown that
  extremely simple adaptive load sharing policies, which collect very small
  amounts of system state information and which use this information in very
  simple ways, yield dramatic performance improvements. These policies in fact
  yield performance close to that expected from more complex policies whose
  viability is questionable. It is concluded that simple policies offer the
  greatest promise in practice, because of their combination of nearly optimal
  performance and inherent stability.}
}

@Article{eager89,
  author = {Derek L. Eager and John Zahorjan and Edward D. Lazowska},
  title = {{Speedup versus Efficiency in Parallel Systems}},
  journal = {IEEE Transactions on Computers},
  year = {1989},
  month = {March},
  volume = {38},
  number = {3},
  pages = {408--23},
  keyword = {scheduling, parallel},
  abstract = {The tradeoff between speedup and efficiency that is inherent to a
  software system is investigated. The extent to which this tradeoff is
  determined by the average parallelism of the software system, as contrasted
  with other, more detailed, characterizations, is shown. The extent to which
  both speedup and efficiency can simultaneously be poor is bound: it is shown
  that for any software system and any number of processors, the sum of the
  average processor utilization (i.e. efficiency) and the attained fraction of
  the maximum possible speedup must exceed one. Bounds are given on speedup and
  efficiency, and on the incremental benefit and cost of allocating additional
  processors. An explicit formulation, as well as bounds, are given for the
  location of the knee of the execution time-efficiency profile, where the
  benefit per unit cost is maximized.}
}

@InProceedings{efe92,
  author = {Efe, Kemal and Schaar, Margaret A.},
  title = {{Performance of Co-Scheduling on a Network of Workstations}},
  booktitle = {Proceedings of the 13th International Conference on Distributed
  Computing Systems},
  year = {1993},
  pages = {525-531},
  keyword = {network of workstations, cluster, parallel, scheduling},
  abstract = {In a set of high performance workstations connected by a network,
  many workstations may be underutilized by their owners. While each
  workstation may be primarily responsible for executing its owner's tasks with
  the highest priority, the unused processing capacity may be made available to
  computationally intensive tasks submitted externally to the system. Static
  co-scheduling for such an environment has been considered previously (M.J.
  Atallah et al., 1991), where the goal was to maximize the speedup by
  partitioning the task among many workstations. The authors consider the
  problem from the system point of view, and develop a queuing model and
  efficient algorithms to minimize the mean response time. The results obtained
  show that significant improvements in the mean response time can be obtained
  through co-scheduling over that of the M/M/m system where each task would be
  assigned to a single workstation as a whole.}
}

@InProceedings{epema95,
  author = {Epema, D.H.J.},
  title = {{An Analysis of Decay-Usage Scheduling in Multiprocessors}},
  booktitle = {Proceedings of Sigmetrics '95/Performance '95},
  year = {1995},
  month = {May},
  pages = {74-85},
  organization = {ACM},
  address = {Ottawa, Canada},
  keyword = {scheduling, parallel, time share},
  abstract = {Priority-ageing or decay-usage scheduling is a time-sharing
  scheduling policy capable of dealing with a workload of both interactive and
  batch jobs by decreasing the priority of a job when it acquires CPU time, and
  by increasing its priority when it does not use the CPU. In this paper we
  deal with a decay-usage scheduling policy in multiprocessor systems modeled
  after widely used systems. The priority of a job consists of a base priority
  and a time-dependent part based on processor usage. Because the priorities in
  our model are time dependent, a queueing-theoretic analysis, for instance for
  the mean response time, seems impossible. Still, it turns out that as a
  consequence of the scheduling policy, the shares of available CPU time
  obtained by jobs converge, and a deterministic analysis for these shares is
  feasible: for a fixed set of jobs with very large (infinite) processing
  demands, we derive the relation between their base priorities and their
  steady-state shares. In addition, we analyze the relation between the values
  of the parameters of the scheduler and the level of control it can exercise
  over the steady-state shares. We validate the model by simulations and by
  measurements of actual systems.}
}

@InProceedings{evans93,
  author = {Evans, Steve and Clarke, Kevin and Singleton, Dave and Smaalders,
  Bart},
  title = {{Optimizing Unix Resource Scheduling for User Interaction}},
  booktitle = {1993 Summer Usenix},
  year = {1993},
  month = {June},
  pages = {205-218},
  organization = {USENIX},
  keyword = {scheduling, interactive},
  abstract = {Techniques for improving system responsiveness for interactive
  end users of Unix workstations are explored. After a discussion of the
  current state of resource scheduling, a model is presented in which dynamic
  input from the human user is combined with data from user interaction
  software to supply a centralized manager with the information necessary to
  determine what processes are involved with interacting with the user at any
  given moment. This service then communicates this process set information to
  the kernel, which uses it to manage memory and CPU resource allocation on the
  behalf of the user. Experience with a prototype of this environment is
  reported. An argument for an interactive scheduling class is made, along with
  other infrastructure changes needed to take advantage of it.}
}

@Article{fayolle80,
  author = {Fayolle, G. and Mitrani, I. and Iasnogorodski, R.},
  title = {{Sharing a Processor Among Many Job Classes}},
  journal = {Journal of the Association for Computing Machinery},
  year = {1980},
  month = {July},
  volume = {27},
  number = {3},
  pages = {519-532},
  keyword = {scheduling},
  abstract = {A single-server processor-sharing system with M job classes is
  analyzed in the steady state. The scheduling strategy considered divides the
  total processor capacity in unequal fractions among the different job
  classes. More precisely, if there are N/sub j/ jobs of class j in the system,
  j=1,2,..., M, each class k job receives a fraction g/sub k//( Sigma /sub
  j=1//sup M/g/sub j/N/sub j/) of the processor capacity. Earlier analyses of
  this system are shown to be incorrect and new expressions for the conditional
  expected response times W/sub k/(t) of class k jobs with required service
  time t are obtained (for general required service time distributions). These
  yield the asymptotic behavior of W/sub k/(t) as t to infinity and rather
  simple formulas in the exponential case. The unconditional average response
  times are also obtained.}
}

@Article{feitelson90,
  author = {Feitelson,Dror~G. and Rudolph, Larry},
  title = {{Distributed Hierarchical Control for Parallel Processing}},
  journal = {IEEE Computer},
  year = {1990},
  month = {May},
  volume = {23},
  number = {5},
  pages = {65-77},
  keyword = {scheduling, parallel, coscheduling},
  abstract = {A description is given of a novel design, using a hierarchy of
  controllers, that effectively controls a multiuser, multiprogrammed parallel
  system. Such a structure allows dynamic repartitioning according to changing
  job requirements. The design goals are examined, and the principles of
  distributed hierarchical control are presented. Control over processors is
  discussed. Mapping and load balancing with distributed hierarchical control
  are considered. Support for gang scheduling as well as availability and fault
  tolerance is addressed. The use of distributed hierarchical control in memory
  management and I/O is discussed.}
}

@Article{feitelson92,
  author = {Feitelson, Dror G. and Rudolph, Larry},
  title = {{Gang Scheduling Performance Benefits for Fine-Grained
  Synchronization}},
  journal = {Journal of Parallel and Distributed Computing},
  year = {1992},
  month = {December},
  volume = {16},
  number = {4},
  pages = {306-18},
  keyword = {scheduling, parallel, coscheduling},
  abstract = {Multiprogrammed multiprocessors executing fine-grain parallel
  programs appear to require new scheduling policies. A promising new idea is
  gang scheduling, where a set of threads are scheduled to execute
  simultaneously on a set of processors. This has the intuitive appeal of
  supplying the threads with an environment that is very similar to a dedicated
  machine. It allows the threads to interact efficiently by using busy waiting,
  without the risk of waiting for a thread that currently is not running.
  Without gang scheduling, threads have to block in order to synchronize, thus
  suffering the overhead of a context switch. While this is tolerable in
  coarse-grain computations, and might even led to performance benefits if the
  threads are highly unbalanced, it causes severe performance degradation in
  the fine-grain case. The authors have developed a model to evaluate the
  performance of different combinations of synchronization mechanisms and
  scheduling policies, and validated it by an implementation on the Makbilan
  multiprocessor. The model leads to the conclusion that gang scheduling is
  required for efficient fine-grain synchronization on multiprogrammed
  multiprocessors.}
}

@Article{feitelson95:runtime,
  author = {Feitelson,Dror~G. and Rudolph, Larry},
  title = {{Coscheduling Based on Run-Time Identification of Activity Working
  Sets}},
  journal = {International Journal of Parallel Programming},
  year = {1995},
  month = {April},
  volume = {23},
  number = {2},
  pages = {136-160},
  keyword = {scheduling, parallel, coscheduling},
  abstract = {This paper introduces a method for runtime identification of sets
  of interacting activities ("working sets") with the purpose of coscheduling
  them, i.e., scheduling them so that all the activities in the set execute
  simultaneously on distinct processors. The identification is done by
  monitoring access rates to shared communication objects: activities that
  access the same objects at a high rate thereby interact frequently, and
  therefore would benefit from coscheduling. Simulation results show that
  coscheduling with our runtime identification scheme can give better
  performance than uncoordinated scheduling based on a single global activity
  queue. The finer-grained the interactions among the activities in a working
  set, the better the performance differential. Moreover, coscheduling based on
  automatic runtime identification achieves about the same performance as
  coscheduling based on manual identification of working sets by the
  programmer.}
}

@TechReport{feitelson95:survey,
  author = {Feitelson, Dror G.},
  title = {{A Survey of Scheduling in Multiprogrammed Parallel Systems}},
  year = {1995},
  month = {February},
  type = {Research Report RC 19790 (87657)},
  institution = {IBM T.J. Watson Research Center},
  address = {Yorktown Heights, NY},
  keyword = {scheduling, parallel, survey}
}

@InProceedings{ferguson88,
  author = {Ferguson, Donald and Yemini, Yechiam and Nikolaou, Christos},
  title = {{Microeconomic Algorithms for Load Balancing in Distributed Computer
  Systems}},
  booktitle = {International Conference on Distributed Computer Systems},
  year = {1988},
  pages = {491-499},
  keyword = {scheduling, load balancing, distributed, microeconomic},
  abstract = {A novel approach to allocating and sharing communication and
  computational resources in a distributed system is described. The approach,
  which is based on concepts drawn from microeconomics, uses algorithms that
  are competitive rather than cooperative. The effectiveness of these concepts
  is demonstrated by describing an economy that improves the performance of a
  distributed system by implementing load balancing. In this economy,
  competition sets prices for the resources in the system. Jobs complete for
  the resources by issuing bids, and the resource allocation decisions are made
  through auctions held by the processors. The benefits of the method include
  limited complexity and algorithms that are intrinsically decentralized and
  modular. Simulation studies show that these economies achieve substantial
  performance benefits.}
}

@TechReport{fong95,
  author = {Fong, Liana L. and Squillante, Mark S.},
  title = {{Time-Function Scheduling: A General Apporach to Controllable
  Resource Management}},
  year = {1995},
  month = {August},
  number = {RC 20155 (89194)},
  institution = {IBM Research Division, T.J. Watson Research Center},
  address = {Yorktown Heights, NY 10598},
  keyword = {scheduling, proportional-share}
}

@InProceedings{ford96,
  author = {Ford, Bryan and Susarla, Sai},
  title = {{CPU Inheritance Scheduling}},
  booktitle = {Usenix Association Second Symposium on Operating Systems Design
  and Implementation (OSDI)},
  year = {1996},
  pages = {91-105},
  keyword = {scheduling, proportional-share},
  abstract = {Traditional processor scheduling mechanisms in operating systems
  (OSs) are fairly rigid, often supporting only one fixed scheduling policy,
  or, at most, a few "scheduling classes" whose implementations are closely
  tied together in the OS kernel. This paper presents CPU inheritance
  scheduling, a novel processor scheduling framework in which arbitrary threads
  can act as schedulers for other threads. Widely different scheduling policies
  can be implemented under the framework, and many different policies can
  coexist in a single system, providing much greater scheduling flexibility.
  Modular, hierarchical control can be provided over the processor utilization
  of arbitrary administrative domains, such as processes, jobs, users and
  groups, and the CPU resources consumed can be accounted for and attributed
  accurately. Applications, as well as the OS, can implement customized local
  scheduling policies; the framework ensures that all the different policies
  work together logically and predictably. As a side effect, the framework also
  cleanly addresses priority inversion by providing a generalized form of
  priority inheritance that automatically works within and among diverse
  scheduling policies. CPU inheritance scheduling extends naturally to
  multiprocessors, and supports processor management techniques such as
  processor affinity and scheduler activations. We show that this flexibility
  can be provided with acceptable overhead in typical environments, depending
  on factors such as context switch speed and frequency.}
}

@Article{ghosal91,
  author = {Ghosal, Dipak and Serazzi, Giuseppe and Tripathi, Satish K.},
  title = {{The Processor Working Set and Its Use in Scheduling Multiprocessor
  Systems}},
  journal = {IEEE Transactions on Software Engineering},
  year = {1991},
  month = {May},
  volume = {17},
  number = {5},
  pages = {443-453},
  keyword = {scheduling, parallel},
  abstract = {The concept of a processor working set (PWS) as a single value
  parameter for characterizing the parallel program behavior is introduced.
  Through detailed experimental studies of different algorithms on a
  transputer-based multiprocessor machine, it is shown that the PWS is a robust
  measure for characterizing the workload of a multiprocessor system. It is
  shown that processor allocation strategies based on the PWS provide
  significantly better throughput-delay characteristics. The robustness of PWS
  is further demonstrated by showing that allocation policies that allocate
  processors more than the PWS are inferior in performance to those that never
  allocate more than the PWS-even at a moderately low load. Based on the
  results, a simple static allocation policy that allocates the PWS at low load
  and adaptively fragments at high load to one processor per job is proposed.}
}

@InProceedings{goyal96,
  author = {Goyal, Pawan and Guo, Xingang and Vin, Harrick M.},
  title = {{A Hierarchical CPU Scheduler for Multimedia Operating Systems}},
  booktitle = {Usenix Association Second Symposium on Operating Systems Design
  and Implementation (OSDI)},
  year = {1996},
  pages = {107-121},
  keyword = {scheduling, proportional-share},
  abstract = {The need to support a variety of hard and soft real-time systems,
  as well as best-effort applications in a multimedia computing environment
  requires an operating system framework that: (1) enables different schedulers
  to be employed for different application classes, and (2) provides protection
  between the various classes of applications. We argue that these objectives
  can be achieved by hierarchical partitioning of CPU bandwidth, in which an
  operating system partitions the CPU bandwidth among various application
  classes, and each application class, in turn, partitions its allocation
  (potentially using a different scheduling algorithm) among its sub-classes or
  applications. We present the start-time fair queuing (SFQ) algorithm, which
  enables such hierarchical partitioning. We have implemented a hierarchical
  scheduler in Solaris 2.4. We describe our implementation and demonstrate its
  suitability for multimedia operating systems.}
}

@InProceedings{gupta91,
  author = {Gupta, Anoop and Tucker, Andrew and Urushibara, Shigeru},
  title = {{The Impact of Operating System Scheduling Policies and
  Synchronization Methods on the Performance of Parallel Applications}},
  booktitle = {Proceedings of the 1991 ACM SIGMETRICS Conference},
  year = {1991},
  month = {May},
  pages = {120-32},
  keyword = {parallel, scheduling, synchronization},
  abstract = {Shared-memory multiprocessors are frequently used as compute
  servers with multiple parallel applications executing at the same time. In
  such environments, the efficiency of a parallel application can be
  significantly affected by the operating system scheduling policy. The authors
  use detailed simulation studies to evaluate the performance of several
  different scheduling strategies. They also explore tradeoffs between the use
  of busy-waiting and blocking synchronization primitives and their
  interactions with the scheduling strategies. A key focus is on the impact of
  the scheduling strategies on the caching behavior of the applications.}
}

@Article{hellerstein93,
  author = {Hellerstein, Joseph L.},
  title = {{Achieving Service Rate Objectives with Decay Usage Scheduling}},
  journal = {IEEE Transactions on Software Engineering},
  year = {1993},
  month = {August},
  volume = {19},
  number = {8},
  pages = {813-825},
  keyword = {scheduling, time share},
  abstract = {Decay usage scheduling is a priority- and usage-based approach to
  CPU allocation in which preference is given to processes that have consumed
  little CPU in the recent past. The author develops an analytic model for
  decay usage schedulers running compute-bound workloads, such as those found
  in many engineering and scientific environments; the model is validated from
  measurements of a Unix system. This model is used in two ways. First, ways to
  parameterize decay usage schedulers are studied to achieve a wide range of
  service rates. Doing so requires a fine granularity of control and a large
  range of control. The results show that, for a fixed representation of
  process priorities a larger range of control makes the granularity of control
  coarser, and a finer granularity of control decreases the range of control. A
  second use of the analytic model is to construct a low overhead algorithms
  for achieving service rate objectives. Existing approaches require adding a
  feedback loop to the scheduler. This overhead is avoided by exploiting the
  feedback already present in decay usage schedulers. Using both empirical and
  analytical techniques, it is shown that the algorithm is effective and that
  it provides fairness when the system is over- or under-loaded.}
}

@Article{henry84,
  author = {Henry, G. J.},
  title = {{The Fair Share Scheduler}},
  journal = {AT\&T Bell Laboratories Technical Journal},
  year = {1984},
  month = {October},
  volume = {63},
  number = {8},
  pages = {1845-1857},
  keyword = {scheduling, fair},
  abstract = {The Fair Share Scheduler (FSS) is a process scheduling scheme
  within the UNIX operating system that controls the distribution of resources
  to sets of related processes. This control offers features that are useful to
  many applications, including user control of service level, execution
  predictability, fair resource allocation, predictable and fair billing, and
  load insulation between user communities. The author discusses the concepts
  of a fair share scheduler, the motivation for and history behind FSS, some
  practical FSS applications, the user and administrator interfaces to FSS, and
  the design philosophy of FSS.}
}

@InProceedings{karlin91,
  author = {Karlin, A.R. and Li, K. and Manasse, M.S. and Owicki, S.},
  title = {Empirical Studies of Competitive Spinning for a Shared-Memory
  Multiprocessor},
  booktitle = {Thirteenth ACM Symposium on Operating Systems Principles},
  year = {1991},
  month = {October},
  keyword = {parallel, shared-memory, synchronization, scheduling},
  abstract = {A common operation in multiprocessor programs is acquiring a lock
  to protect access to shared data. Typically, the requesting thread is blocked
  if the lock it needs is held by another thread. Alternatively, the thread
  could spin until the lock is free, or spin for a while and then block. The
  authors study seven strategies for determining whether and how long to spin
  before blocking. Of particular interest are competitive strategies, for which
  the performance can be shown to be no worse than some constant factor times
  an optimal off-line strategy. The performance of five competitive strategies
  is compared with that of always blocking, always spinning, or using the
  optimal off-line algorithm. Measurements of lock-waiting time distributions
  for five parallel programs were used to compare the cost of synchronization
  under all the strategies. Additional measurements of elapsed time for some of
  the programs and strategies allowed assessment of the impact of
  synchronization strategy on overall program performance. Both types of
  measurements indicate that the standard blocking strategy performs poorly
  compared to mixed strategies. Among the mixed strategies studied, adaptive
  algorithms perform better than non-adaptive ones.}
}

@Article{kay88,
  author = {Kay, J. and Lauder, P.},
  title = {{A Fair Share Scheduler}},
  journal = {Communications of the ACM},
  year = {1988},
  month = {January},
  volume = {31},
  number = {1},
  pages = {44-55},
  keyword = {scheduling, fair},
  abstract = {Central-processing-unit schedulers have traditionally allocated
  resources fairly among processes. By contrast, a fair Share scheduler
  allocates resources so that users get their fair machine share over a long
  period. To date, Share has been used exclusively to allocate CPU time, though
  it takes into account the consumption of all resources. Share may be
  applicable to the scheduling of resources other than CPU, but for simplicity,
  the article concentrates on CPU scheduling.}
}

@Article{kleinrock67,
  author = {Kleinrock, Leonard},
  title = {{Time-shared Systems: A Theoretical Treatment}},
  journal = {Journal of the Association for Computing Machinery},
  year = {1967},
  month = {April},
  volume = {14},
  number = {2},
  pages = {242-261},
  keyword = {time share, scheduling}
}

@InProceedings{konto93,
  author = {Kontothanassis, Leonidas I. and Wisniewski, Robert W.},
  title = {{Using Scheduler Information to Achieve Optimal Barrier
  Synchronization Performance}},
  booktitle = {Proceedings of the ACM SIGPLAN Symposium on Principles and
  Practice of Parallel Programming},
  year = {1993},
  month = {May},
  keyword = {sychronization, scheduling, parallel},
  abstract = {Parallel programs frequently use barriers to synchronize
  successive steps in an algorithm. In the presence of multiprogramming, the
  choice of spinning versus blocking barriers can have a significant impact on
  performance. The authors demonstrate how competitive spinning techniques
  previously designed for locks can be extended to barriers, and evaluate their
  performance. They design an additional competitive spinning technique that
  adapts more quickly in a dynamic environment. They then propose and evaluate
  a new method that obtains by using scheduler information to decide between
  spinning and blocking. The schedular information technique makes optimal
  choices incurring little overhead.}
}

@InProceedings{lee97,
  author = {Lee, Walter and Frank, Matthew and Lee, Victor and Mackenzi,
  Kenneth and Rudolph, Larry},
  title = {{Implications of I/O for Gang Scheduled Workloads}},
  booktitle = {Proceedings of the IPPS '97 Workshop on Job Scheduling
  Strategies for Parallel Processing},
  year = {1997},
  keyword = {scheduling, I/O, coscheduling, parallel}
}

@InProceedings{leutenegger90,
  author = {Leutenegger, Scott T. and Vernon, Mary K.},
  title = {{The Performance of Multiprogrammed Multiprocessor Scheduling
  Policies}},
  booktitle = {Proceedings of the 1990 ACM SIGMETRICS Conference},
  year = {1990},
  month = {May},
  pages = {226-36},
  keyword = {parallel, scheduling},
  abstract = {Scheduling policies for general purpose multiprogrammed
  multiprocessors are not well understood. This paper examines various policies
  to determine which properties of a scheduling policy are the most significant
  determinants of performance. The authors compare a more comprehensive set of
  policies than previous work, including one important scheduling policy that
  has not previously been examined. They also compare the policies under
  workloads that they feel are more realistic than previous studies have used.
  Using these new workloads, they arrive at different conclusions than reported
  in earlier work. In particular, they find that the 'smallest number of
  processes first' (SNPF) scheduling discipline performs poorly, even when the
  number or processes in a job is positively correlated with the total service
  demand of the job. The authors also find that policies that allocate an equal
  fraction of the processing power to each job in the system perform better, on
  the whole, than policies that allocate processing power unequally. Finally,
  they find that for lock access synchronization, dividing processing power
  equally among all jobs in the system is a more effective property of a
  scheduling policy than the property of minimizing synchronization
  spin-waiting, unless demand for synchronization is extremely high.}
}

@InProceedings{leutenegger93,
  author = {Scott T. Leutenegger and Xian-He Sun},
  title = {{Distributed Computing Feasibility in a Non-Dedicated Homogenous
  Distributed System}},
  booktitle = {Proceedings of Supercomputing '93},
  year = {1993},
  pages = {143--152},
  keyword = {networks of workstations, clusters, parallel, distributed,
  scheduling},
  abstract = {The authors address the feasibility of a nondedicated parallel
  processing environment, assuming workstation processes have preemptive
  priority over parallel tasks. They develop an analytical model to predict
  parallel job response times. The model provides insight into how
  significantly the workstation owner interference degrades parallel program
  performance. A new term, task ratio, which relates the parallel task demand
  to the mean service demand of nonparallel workstation processes, is
  introduced. It is proposed that the task ratio is a useful metric for
  determining how large the demand of a parallel applications must be in order
  to make efficient use of a nondedicated distributed system.}
}

@InProceedings{litzkow88,
  author = {Litzkow, Michael and Livny, Miron and Mutka, Matt},
  title = {{Condor - A Hunter of Idle Workstations}},
  booktitle = {Proceedings of the 8th International Conference of Distributed
  Computing Systems},
  year = {1988},
  month = {June},
  pages = {104-111},
  keyword = {scheduling, networks of workstations, migration, idle, clusters},
  abstract = {The design, implementation, and performance of the Condor
  scheduling system, which operates in a workstation environment, are
  presented. The system aims to maximize the utilization of workstations with
  as little interference as possible between the jobs it schedules and the
  activities of the people who own workstations. It identifies idle
  workstations and schedules background jobs on them. When the owner of a
  workstation resumes activity at a station, Condor checkpoints the remote job
  running on the station and transfers it to another workstation. The system
  guarantees that the job will eventually complete, and that very little, if
  any, work will be performed more than once. A performance profile of the
  system is presented that is based on data accumulated from 23 stations during
  one month.}
}

@InProceedings{lo87,
  author = {Lo, Shau-Ping and Gligor, Virgil D.},
  title = {{A Comparative Analysis of Multiprocessor Scheduling Algorithms}},
  booktitle = {Proceedings of the Seventh International Conference on
  Distributed Computing Systems},
  year = {1987},
  month = {September},
  pages = {356-363},
  keyword = {scheduling, parallel},
  abstract = {Critical path and group algorithms for scheduling multiprocess
  computations on multiprocessor systems are defined. The dependencies of these
  algorithms on the degree and pattern of message interaction are reviewed, and
  the sensitivity of these dependencies on the length of the pause-wait time
  and the swap-in/swap-out time is investigated using simulation experiments.
  The authors explain the notions of the threshold number of processors for
  multiprocess computations. An analytical procedure is developed to compute a
  lower bound of the differences in scheduling overhead for the simulated
  algorithms when the number of processors is large. Thus procedure is used to
  validate some of the results of the simulation experiments.}
}

@InBook{malone88,
  author = {Malone, T. and Fikes, R. and Grant, K. and Howard, M},
  title = {{Enterprise: A market-like task scheduler for distributed computing
  environments}},
  year = {1988},
  pages = {177-205},
  publisher = {North-Holland},
  keyword = {scheduling, microeconomic, distributed}
}

@InProceedings{mccann93,
  author = {McCann, C. and Vaswani, R. and Zahorjan, John},
  title = {{A Dynamic Processor Allocation Policy for Multi-programmed
  Shared-Memory Multiprocessors}},
  booktitle = {ACM Transactions on Computer Systems},
  year = {1993},
  month = {May},
  volume = {11},
  pages = {146-178},
  keyword = {scheduling, parallel},
  abstract = {The authors propose and evaluate empirically the performance of a
  dynamic processor-scheduling policy for multiprogrammed shared-memory
  multiprocessors. The policy is dynamic in that it reallocates processors from
  one parallel job to another based on the currently realized parallelism of
  those jobs. The policy is suitable for implementation in production systems
  in that: it interacts well with very efficient user-level thread packages,
  leaving to them many low-level thread operations that do not require kernel
  intervention; it deals with thread blocking due to user I/O and page faults
  it ensures fairness in delivering resources to jobs; its performance,
  measured in terms of average job response time, is superior to that of
  previously proposed schedules; it provides good performance to very short,
  sequential (e.g., interactive) requests.}
}

@InProceedings{mccann94,
  author = {Cathy McCann and John Zahorjan},
  title = {{Processor Allocation Policies for Message-Passing Parallel
  Computers}},
  booktitle = {Proceedings of the 1994 ACM SIGMETRICS Conference},
  year = {1994},
  month = {February},
  pages = {19--32},
  keyword = {scheduling, parallel},
  abstract = {When multiple jobs compete for processing resources on a parallel
  computer, the operating system kernel's processor allocation policy
  determines how many and which processors to allocate to each. In this paper
  we investigate the issues involved in constructing a processor allocation
  policy for large scale, message-passing parallel computers supporting a
  scientific workload.}
}

@InProceedings{mccann95,
  author = {McCann, Cathy and Zahorjan, John},
  title = {{Scheduling Memory Constrained Jobs on Distributed Memory Parallel
  Computers}},
  booktitle = {Proceedings of ACM SIGMETRICS'95/PERFORMANCE'95 Joint
  International Conference on Measurement and Modeling of Computer Systems},
  year = {1995},
  month = {May},
  pages = {208-219},
  keyword = {scheduling, parallel, memory},
  abstract = {We consider the problem of multiprocessor scheduling of jobs
  whose memory requirements place lower bounds on the fraction of the machine
  required in order to execute. We address three primary questions in this
  work: 1. How can a parallel machine be multiprogrammed with minimal overhead
  when jobs have minimum memory requirements? 2. To what extent does the
  inability of an application to repartition its workload during runtime affect
  the choice of processor allocation policy? 3. How rigid should the system be
  in attempting to provide equal resource allocation to each runnable job in
  order to minimise average response time?. This work is applicable both to
  parallel machines and to networks of workstations supporting parallel
  applications.}
}

@InBook{miller88,
  author = {Miller, Mark and Drexler, K.},
  title = {{Markets and computation: Agoric open systems}},
  year = {1988},
  pages = {133-176},
  publisher = {North-Holland},
  keyword = {scheduling, microeconomic}
}

@InProceedings{mitzen96,
  author = {Mitzenmacher, Michael},
  title = {{Load balancing and density dependent jump Markov processes}},
  booktitle = {Proceedings of 37th Conference on Foundations of Computer
  Science},
  year = {1996},
  month = {October},
  pages = {213-222},
  keyword = {scheduling, load balance, random},
  abstract = {We provide a new approach for analyzing both static and dynamic
  randomized load balancing strategies. We demonstrate the approach by
  providing the first analysis of the following model: customers arrive as a
  Poisson stream of rate lambda /sub n/, lambda <1, at a collection of n
  servers. Each customer chooses some constant d servers independently and
  uniformly at random from the n servers, and waits for service at the one with
  the fewest customers. Customers are served according to the first-in
  first-out (FIFO) protocol, and the service time for a customer is
  exponentially distributed with mean 1. We call this problem the supermarket
  model. We wish to know how the system behaves, and in particular we are
  interested in the expected time a customer spends in the system in
  equilibrium. The model provides a good abstraction of a simple, efficient
  load balancing scheme in the setting where jobs arrive at a large system of
  parallel processors. This model appears more realistic than similar models
  studied previously, in that it is both dynamic and open: that is, customers
  arrive over time, and the number of customers is not fixed.}
}

@InProceedings{mutka87,
  author = {Mutka, Matt and Livny, Miron},
  title = {{Scheduling Remote Processing Capacity In A Workstation-Processor
  Bank Network}},
  booktitle = {Proceedings of the 7th International Conference on Distributed
  Computing Systems},
  year = {1987},
  month = {September},
  pages = {2-9},
  keyword = {idle, clusters, network of workstations, scheduling, fair},
  abstract = {The problem of long-term scheduling of a group of workstations
  and a processor bank is considered. It is assumed that each workstation is
  under the full control of its user, whereas the processors that constitute
  the processor bank are public resources. Therefore, a workstation can be
  allocated for remote processing only if its user does not perform any local
  activity. A long-term scheduling algorithm called the up-down algorithm and a
  set of performance criteria for evaluating these types of scheduling
  algorithms are presented. Using these criteria and traces of usage patterns
  of 13 workstations, the algorithm is evaluated, and its efficiency and
  fairness are demonstrated. The performances of the round-robin and the random
  algorithms are analyzed using the same criteria and workload, and is shown
  that the up-down algorithm outperforms the other two. All of the three
  algorithm provide the same throughput, but the up-down algorithm protects the
  rights of light users when a few heavy users try to monopolize all free
  resources. The two other algorithms do not maintain a steady quality of
  service for light users in the face of an increasing load of heavy users.}
}

@Article{mutka91,
  author = {Mutka, Matt M. and Livny, Miron},
  title = {{The Available Capacity of a Privately Owned Workstation
  Environment}},
  journal = {Performance Evaluation},
  year = {1991},
  month = {July},
  volume = {12},
  number = {4},
  pages = {269-84},
  keyword = {idle, clusters, network of workstations, scheduling},
  abstract = {Powerful workstations interconnected by networks have become
  widely available as sources of computing cycles. Each workstation is
  typically owned by a single user in order to provide a high quality of
  service for the owner. In most cases, an owner does not have computing
  demands as large as the capacity of the workstation. Therefore, most of the
  workstations are underutilized. Nevertheless, some users have demands that
  exceed the capacities of their workstations. In order to effectively share
  the capacity of workstations, there must be algorithms that allocate
  available capacity and long periods when owners do not use their stations. To
  understand the profile of station availability, the authors analyzed the
  usage patterns of a cluster of workstations. A stochastic model was developed
  which was based on an analysis of the relative frequency distribution and the
  correlation of available and non-available interval lengths. This stochastic
  model is important as a workload generator for the performance evaluation of
  capacity sharing strategies of a cluster of workstations.}
}

@InProceedings{naik93,
  author = {Vijay K. Naik and Sanjeev K. Setia and Mark S. Squillante},
  title = {{Performance Analysis of Job Scheduling Policies in Parallel
  Supercomputing Environments}},
  booktitle = {Proceedings of Supercomputing '93},
  year = {1993},
  month = {November},
  pages = {824--833},
  keyword = {parallel, scheduling},
  abstract = {The authors analyze three general classes of scheduling policies
  under a workload typical of large-scale scientific computing. These policies
  differ in the manner in which processors are partitioned among the jobs as
  well as the way in which jobs are prioritized for execution on the
  partitions. The results indicate that existing static schemes to not perform
  well under varying workloads. Adaptive policies tend to make better
  scheduling decisions, but their ability to adjust to workload changes is
  limited. Dynamic partitioning policies, on the other hand, yield the best
  performance and can be tuned to provide desired performance differences among
  jobs with varying resource demands.}
}

@Conference{ousterhout82,
  author = {Ousterhout, John K.},
  title = {{Scheduling Techniques for Concurrent Systems}},
  booktitle = {Third International Conference on Distributed Computing
  Systems},
  year = {1982},
  month = {May},
  pages = {22-30},
  keyword = {scheduling, coscheduling, parallel},
  abstract = {Current operating systems base many of their decisions on the
  assumption that processes are independent. This paper examines what happens
  in multiprocessor systems that use interprocess communication extensively.
  Traditional techniques for short-term scheduling result in serious
  limitations on interprocessor communication; a two-phase blocking scheme is
  suggested as a solution to the problem. Long-term scheduling policies that
  assume process independence result in a form of thrashing when there are
  groups of cooperating processes. The notion of coscheduling is introduced,
  and three algorithms are described for achieving it. Simulation results
  suggest that substantial degrees of coscheduling can be achieved over a
  variety of conditions using relatively simple methods.}
}

@InProceedings{peris94,
  author = {Vinod G.J. Peris and Mark S. Squillante and Vijay K. Naik},
  title = {{Analysis of the Impact of Memory in Distributed Parallel Processing
  Systems}},
  booktitle = {Proceedings of the 1994 ACM SIGMETRICS Conference},
  year = {1994},
  month = {February},
  pages = {5--18},
  keyword = {scheduling, parallel},
  abstract = {We consider an important tradeoff between processor and memory
  allocation in distributed parallel processing systems. To study this
  tradeoff, we formulate stochastic models of parallel program behavior,
  distributed parallel processing environments and memory overheads incurred by
  parallel programs as a function of their processor allocation. A mathematical
  analysis of the models is developed, which includes the effects of contention
  for shared resources caused by paging activity. We conduct a detailed
  analysis of real large-scale scientific applications and use these results to
  parameterize our models. Our results show that memory overhead resulting from
  processor allocation decisions can have a significant effect on system
  performance in distributed parallel environments, strongly suggesting that
  memory considerations must be incorporated in the resource allocation
  policies for parallel systems. We also demonstrate the importance of the
  inter-locality miss ratio, which is introduced in this paper and analyzed for
  the first time.}
}

@InProceedings{sevcik89,
  author = {Kenneth C. Sevcik},
  title = {{Characterizations of Parallelism in Applications and their Use in
  Scheduling}},
  booktitle = {Proceedings of the 1989 ACM SIGMETRICS and PERFORMANCE
  Conference on Measurement and Modeling of Computer Systems},
  year = {1989},
  month = {May},
  pages = {171--180},
  keyword = {scheduling, parallel},
  abstract = {Examines the quality of processor allocation decisions under
  multiprogramming that can be made with several different high-level
  characterizations of application parallelism. The author demonstrates that
  decisions based on parallelism characterizations with two to four parameters
  are superior to those based on single-parameter characterizations (such as
  fraction sequential or average parallelism). The results are based
  predominantly on simulation, with some guidance from a simple analytic
  model.}
}

@Article{shoch82,
  author = {John F. Shoch and Jon A. Hupp},
  title = {{The Worm Programs: Early Experience with a Distributed
  Computation}},
  journal = {Communications of the ACM},
  year = {1982},
  month = {March},
  volume = {25},
  number = {3},
  keyword = {scheduling, distributed, economic}
}

@Article{smith80,
  author = {Smith, Reid},
  title = {{The Contract Net Protocol: High-Level Communication and Control in
  a Distributed Problem Solver}},
  journal = {IEEE Transactions on Computers},
  year = {1980},
  month = {December},
  volume = {C-29},
  number = {12},
  pages = {1104-1113},
  keyword = {distributed, economic, scheduling},
  abstract = {The contract net protocol has been developed to specify
  problem-solving communication and control for nodes in a distributed problem
  solver. Task distribution is affected by a negotiation process, a discussion
  carried on between nodes with tasks to be executed and nodes that may be able
  to execute those tasks. The author presents the specification of the protocol
  and demonstrates its use in the solution of a problem in distributed sensing.
  The utility of negotiation as an interaction mechanism is discussed. It can
  be used to achieve different goals, such as distributing control and data to
  avoid bottlenecks and enabling a finer degree of control in making resource
  allocation and focus decisions than is possible with traditional mechanisms.}
}

@InProceedings{sobalvarro95,
  author = {Sobalvarro, Patrick G. and Weihl, William E.},
  title = {{Demand-based Coscheduling of Parallel Jobs on Multiprogrammed
  Multiprocessors}},
  booktitle = {Proceedings of the IPPS '95 Workshop on Job Scheduling
  Strategies for Parallel Processing},
  year = {1995},
  month = {April},
  pages = {63-75},
  keyword = {parallel, scheduling, coscheduling},
  abstract = {We present demand-based coscheduling, a new approach to
  scheduling parallel computations on multiprogrammed multiprocessors. In
  demand-based coscheduling, rather than making the pessimistic assumption that
  all the processes constituting a parallel job must be simultaneously
  scheduled in order to achieve good performance, we use information about
  which processes are communicating in order to coschedule only these; the
  result is more opportunities for coscheduling and fewer preemptions than in
  more traditional coscheduling schemes. We introduce two particular types of
  demand-based coscheduling. The first is dynamic coscheduling, which was
  conceived for use on message-passing architectures. We present an analytical
  model and a simulation of dynamic coscheduling that show that the algorithm
  can achieve good performance under pessimistic assumptions. The second is
  predictive coscheduling, for which we present an algorithm that detects
  communication by using virtual memory system information on a bus-based
  shared-memory multiprocessor.}
}

@Unpublished{sobalvarro97,
  author = {Sobalvarro, Patrick G. and Pakin, Scott and Weihl, William E. and
  Chien, Andrew A},
  title = {{Dynamic Coscheduling on Workstation Clusters}},
  year = {1997},
  howpublished = {http://www.research.digital.com/SRC/scheduling},
  URL = {http://www.research.digital.com/SRC/scheduling},
  keyword = {parallel, scheduling, coscheduling}
}

@InProceedings{stoica95,
  author = {Stoica, Ion and Abdel-Wahab, Hussein and Pothen, Alex},
  title = {{A Microeconomic Scheduler for Parallel Computers}},
  booktitle = {Proceedings of the IPPS '95 Workshop on Job Scheduling
  Strategies for Parallel Processing},
  year = {1995},
  month = {April},
  pages = {122-135},
  keyword = {parallel, scheduling, economic},
  abstract = {We describe a scheduler based on the microeconomic paradigm for
  scheduling on-line a set of parallel jobs in a multiprocessor system. In
  addition to increasing the system throughput and reducing the response time,
  we consider fairness in allocating system resources among the users, and
  provide the user with control over the relative performances of his jobs.
  Every user has a savings account in which he receives money at a constant
  rate. To run a job, the user creates an expense account for that job to which
  he transfers money from his savings account. The job uses the funds in its
  expense account to obtain the system resources it needs. The share of the
  system resources allocated to the user is directly related to the rate at
  which the user receives money; the rate at which the user transfers money
  into a job expense account controls the job's performance. We prove that
  starvation is not possible in our model. Simulation results show that our
  scheduler improves both system and user performances in comparison with two
  different variable partitioning policies. It is also effective in
  guaranteeing fairness and providing control over the performance of jobs.}
}

@InProceedings{stoica96,
  author = {Stoica, Ion and Abdel-Wahab, Hussein and Jeffay, Kevin and Baruah,
  Sanjoy and Gehrke, Johannes and Plaxton, C. Greg},
  title = {{A Proportional Share Resource Allocation Algorithm for Real-Time,
  Time-Shared Systems}},
  booktitle = {IEEE Real-Time Systems Symposium},
  year = {1996},
  month = {December},
  keyword = {scheduling, proportional-share},
  abstract = {We propose and analyze a proportional share resource allocation
  algorithm for realizing real-time performance in time-shared operating
  systems. Processes are assigned a weight which determines a share
  (percentage) of the resource they are to receive. The resource is then
  allocated in discrete-sized time quanta in such a manner that each process
  makes progress at a precise, uniform rate. Proportional share allocation
  algorithms are of interest because: they provide a natural means of
  seamlessly integrating real and non-real-time processing; they are easy to
  implement; they provide a simple and effective means of precisely controlling
  the real-time performance of a process; and they provide a natural means of
  policing so that processes that use more of a resource than they request have
  no ill-effect on well-behaved processes. We analyze our algorithm in the
  context of an idealized system in which a resource is assumed to be granted
  in arbitrarily small intervals of time and show that our algorithm guarantees
  that the difference between the service time that a process should receive
  and the service time it actually receives is optimally bounded by the size of
  a time quantum. In addition, the algorithm provides support for dynamic
  operations, such as processes joining or leaving the competition, and for
  both fractional and non-uniform time quanta. As a proof of concept we have
  implemented a prototype of a CPU scheduler under FreeBSD. The experimental
  results shows that our implementation performs within the theoretical bounds
  and hence supports real-time execution in a general purpose operating
  system.}
}

@Article{tucker89,
  author = {Tucker, Andy and Gupta, Anoop},
  title = {Process Control and Scheduling Issues for Multiprogrammed
  Shared-Memory Multiprocessors},
  journal = {Operating Systems Review},
  year = {1989},
  volume = {23},
  number = {5},
  pages = {159-66},
  keyword = {parallel, scheduling, shared memory},
  abstract = {Shared-memory multiprocessors are frequently used in a
  time-sharing style with multiple parallel applications executing at the same
  time. In such an environment, where the machine load is continuously varying,
  the question arises of how an application should maximize its performance
  while being fair to other users of the system. They first show that if the
  number of runnable processes belonging to a parallel application
  significantly exceeds the effective number of physical processors executing
  it, its performance can be significantly degraded. They can propose a way of
  controlling the number of runnable processes associated with an application
  dynamically, to ensure good performance. The optimal number of runnable
  processes for each application is determined by a centralized server, and
  applications dynamically suspend or resume processes in order to match that
  number. A preliminary implementation of the proposed scheme is now running on
  the Encore Multimax and they show how it helps improve the performance of
  several applications. In some cases the improvement is more than a factor of
  two. They also discuss implications of the proposed scheme for multiprocessor
  schedulers, and how the scheme should interface with parallel programming
  languages.}
}

@Article{waldspurger92,
  author = {Waldspurger, Carl and Hogg, Tad and Huberman, Bernado and Kephart,
  Jeffrey and Stornetta, Scott},
  title = {{Spawn: A Distributed Computational Economy}},
  journal = {IEEE Transactions on Software Engineering},
  year = {1992},
  month = {February},
  volume = {18},
  number = {2},
  pages = {103-117},
  keyword = {scheduling, economic, parallel},
  abstract = {The authors have designed and implemented an open, market-based
  computational system called Spawn. The Spawn system utilizes idle
  computational resources in a distributed network of heterogeneous computer
  workstations. It supports both coarse-grain concurrent applications and the
  remote execution of many independent tasks. Using concurrent Monte Carlo
  simulations as prototypical applications, the authors explore issues of
  fairness in resource distribution, currency as a form of priority, price
  equilibria, the dynamics of transients, and scaling to large systems. In
  addition to serving the practical goal of harnessing idle processor time in a
  computer network, Spawn has proven to be a valuable experimental workbench
  for studying computational markets and their dynamics.}
}

@InProceedings{waldspurger95:lottery,
  author = {Waldspurger, Carl A. and Weihl, William E.},
  title = {{Lottery Scheduling: Flexible Proportional-Share Resource
  Management}},
  booktitle = {First Symposium on Operating Systems Design and Implementation
  (OSDI)},
  year = {1995},
  pages = {1-11},
  organization = {USENIX Association},
  keyword = {scheduling, proportional-share},
  abstract = {This paper presents lottery scheduling, a novel randomized
  resource allocation mechanism. Lottery scheduling provides efficient,
  responsive control over the relative execution rates of computations. Such
  control is beyond the capabilities of conventional schedulers, and is
  desirable in systems that service requests of varying importance, such as
  databases, media-based applications, and networks. Lottery scheduling also
  supports modular resource management by enabling concurrent modules to
  insulate their resource allocation policies from one another. A currency
  abstraction is introduced to flexibly name, share, and protect resource
  rights. We also show that lottery scheduling can be generalized to manage
  many diverse resources, such as I/O bandwidth, memory, and access to locks.
  We have implemented a prototype lottery scheduler for the Mach 3.0
  microkernel, and found that it provides flexible and responsive control over
  the relative execution rates of a wide range of applications. The overhead
  imposed by our unoptimized prototype is comparable to that of the standard
  Mach timesharing policy.}
}

@PhdThesis{waldspurger95:phdthesis,
  author = {Waldspurger, Carl A.},
  title = {{Lottery and Stride Scheduling: Flexible Proportional-Share Resource
  Management}},
  year = {1995},
  month = {September},
  school = {Massachusetts Institute of Technology},
  note = {Also appears as Technical Report MIT/LCS/TR-667},
  keyword = {scheduling, proportional-share}
}

@TechReport{waldspurger95:stride,
  author = {Waldspurger, Carl A. and Weihl, William E.},
  title = {{Stride Scheduling: Deterministic Proportional-Share Resource
  Mangement}},
  year = {1995},
  month = {June},
  number = {MIT/LCS/TM-528},
  institution = {Massachusetts Institute of Technology},
  address = {MIT Laboratory for Computer Science},
  keyword = {scheduling, proportional-share}
}


