1 //******************************************************************************
2 //
3 // File: Runner.java
4 // Package: edu.rit.pj.job
5 // Unit: Class edu.rit.pj.job.Runner
6 //
7 // This Java source file is copyright (C) 2010 by Alan Kaminsky. All rights
8 // reserved. For further information, contact the author, Alan Kaminsky, at
9 // ark@cs.rit.edu.
10 //
11 // This Java source file is part of the Parallel Java Library ("PJ"). PJ is free
12 // software; you can redistribute it and/or modify it under the terms of the GNU
13 // General Public License as published by the Free Software Foundation; either
14 // version 3 of the License, or (at your option) any later version.
15 //
16 // PJ is distributed in the hope that it will be useful, but WITHOUT ANY
17 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
18 // A PARTICULAR PURPOSE. See the GNU General Public License for more details.
19 //
20 // Linking this library statically or dynamically with other modules is making a
21 // combined work based on this library. Thus, the terms and conditions of the GNU
22 // General Public License cover the whole combination.
23 //
24 // As a special exception, the copyright holders of this library give you
25 // permission to link this library with independent modules to produce an
26 // executable, regardless of the license terms of these independent modules, and
27 // to copy and distribute the resulting executable under terms of your choice,
28 // provided that you also meet, for each linked independent module, the terms
29 // and conditions of the license of that module. An independent module is a module
30 // which is not derived from or based on this library. If you modify this library,
31 // you may extend this exception to your version of the library, but you are not
32 // obligated to do so. If you do not wish to do so, delete this exception
33 // statement from your version.
34 //
35 // A copy of the GNU General Public License is provided in the file gpl.txt. You
36 // may also obtain a copy of the GNU General Public License on the World Wide
37 // Web at http://www.gnu.org/licenses/gpl.html.
38 //
39 //******************************************************************************
40 package edu.rit.pj.job;
41
42 import java.io.File;
43 import java.io.IOException;
44 import java.io.PrintStream;
45 import java.util.Date;
46 import java.util.HashSet;
47 import java.util.Scanner;
48
49 import edu.rit.mp.BooleanBuf;
50 import edu.rit.mp.buf.BooleanItemBuf;
51 import edu.rit.pj.Comm;
52 import edu.rit.pj.WorkerIteration;
53 import edu.rit.pj.WorkerRegion;
54 import edu.rit.pj.WorkerTeam;
55 import edu.rit.util.Instance;
56
57 /**
58 * Class Runner is a parallel program that runs, in parallel, a group of
59 * {@linkplain Job}s created by a {@linkplain JobGenerator}. The job generator
60 * is specified on the command line as a constructor expression. An instance of
61 * the class specified in the constructor expression is constructed, with the
62 * constructor arguments specified in the constructor expression. For further
63 * information, see class {@linkplain edu.rit.util.Instance Instance}.
64 * <P>
65 * The Runner program is targeted at three use cases:
66 * <UL>
67 *
68 * <LI>
69 * <B>Sequential jobs on a cluster parallel computer.</B> Each job is a
70 * sequential (single-threaded) program. The Runner program is running on a
71 * cluster parallel computer with <I>N</I> nodes and one CPU per node. Run the
72 * Runner program as follows:
73 * <PRE>
74 * java -Dpj.nn=<I>N</I> edu.rit.pj.job.Runner . . .
75 * </PRE> The Runner program runs with one process per node and one thread per
76 * process.
77 *
78 * <LI>
79 * <B>Sequential jobs on a hybrid parallel computer.</B> Each job is a
80 * sequential (single-threaded) program. The Runner program is running on a
81 * hybrid SMP cluster parallel computer with <I>N</I> nodes and <I>C</I> total
82 * CPUs. (For example, on a hybrid parallel computer with 10 nodes and 4 CPUs
83 * per node, <I>C</I> = 40.) Run the Runner program as follows:
84 * <PRE>
85 * java -Dpj.nn=<I>N</I> -Dpj.np=<I>C</I> edu.rit.pj.job.Runner . . .
86 * </PRE> The Runner program runs with multiple processes per node and one
87 * thread per process.
88 *
89 * <LI>
90 * <B>SMP parallel jobs on a hybrid parallel computer.</B> Each job is an SMP
91 * parallel (multi-threaded) program. The Runner program is running on a hybrid
92 * SMP cluster parallel computer with <I>N</I> nodes and multiple CPUs per node.
93 * Run the Runner program as follows:
94 * <PRE>
95 * java -Dpj.nn=<I>N</I> edu.rit.pj.job.Runner . . .
96 * </PRE> The Runner program runs with one process per node and multiple threads
97 * per process, typically as many threads as there are CPUs on the node.
98 * </UL>
99 * <P>
100 * All these processes form a <I>worker team.</I> The Runner program uses the
101 * job generator specified on the command line to create jobs and sends each job
102 * to a worker team process to be executed.
103 * <P>
 * When the Runner program starts, it prints the job generator constructor
 * expression on the standard output. Whenever a job starts or finishes, the
 * Runner program prints a log message on the standard output consisting of the
 * job's number and a timestamp; a job-started message also includes the job's
 * description.
108 * <P>
109 * <B>Checkpointing.</B> It is recommended to redirect the Runner program's
110 * standard output into a <I>checkpoint file.</I> If a failure occurs before the
111 * Runner program finishes running all the jobs, the checkpoint file contains a
112 * record of the job generator that was used as well as which jobs did and did
113 * not finish. To resume the Runner program where it left off, specify the
114 * checkpoint file name on the command line instead of a job generator
115 * constructor expression. The Runner program reads the checkpoint file to
116 * determine the job generator and the jobs that finished. The Runner program
117 * then generates and runs the jobs that did not finish.
118 * <P>
119 * Usage: java edu.rit.pj.job.Runner { <I>generator</I> | <I>file</I> }
120 * <BR><I>generator</I> = Job generator constructor expression
121 * <BR><I>file</I> = Checkpoint file name
122 *
123 * @author Alan Kaminsky
124 * @version 22-Oct-2010
125 */
126 public class Runner {
127
128 // Prevent construction.
129 private Runner() {
130 }
131
132 // Global variables.
133 private static PrintStream stdout = System.out;
134 private static PrintStream stderr = System.err;
135
136 private static Comm world;
137 private static int rank;
138
139 private static WorkerTeam team;
140
141 private static String generatorExpression;
142 private static HashSet<Integer> omitted;
143 private static JobGenerator generator;
144
145 // Main program.
146 /**
147 * Main program.
148 *
149 * @param args an array of {@link java.lang.String} objects.
150 * @throws java.lang.Exception if any.
151 */
152 public static void main(String[] args)
153 throws Exception {
154 if (args.length != 1) {
155 usage();
156 }
157
158 // Initialize world communicator.
159 Comm.init(args);
160 world = Comm.world();
161 rank = world.rank();
162
163 // Set up worker team.
164 team = new WorkerTeam();
165
166 // Master process sets up job generator.
167 if (rank == team.masterRank()) {
168 omitted = new HashSet<Integer>();
169
170 // Assume argument is a checkpoint file name and try to read it.
171 Scanner scanner = null;
172 try {
173 scanner = new Scanner(new File(args[0]));
174 } catch (IOException ignored) {
175 }
176
177 // Read checkpoint file.
178 if (scanner != null) {
179 while (scanner.hasNextLine()) {
180 Scanner linescanner = new Scanner(scanner.nextLine());
181 String word;
182 int jobnum;
183 if (!linescanner.hasNext()) {
184 continue;
185 }
186 word = linescanner.next();
187 if (!word.equals("***")) {
188 continue;
189 }
190 if (!linescanner.hasNext()) {
191 continue;
192 }
193 word = linescanner.next();
194 if (word.equals("Generator")) {
195 if (!linescanner.hasNext()) {
196 continue;
197 }
198 if (generatorExpression == null) {
199 generatorExpression = linescanner.next();
200 }
201 } else if (word.equals("Job")) {
202 if (!linescanner.hasNextInt()) {
203 continue;
204 }
205 jobnum = linescanner.nextInt();
206 if (!linescanner.hasNext()) {
207 continue;
208 }
209 word = linescanner.next();
210 if (word.equals("finished")) {
211 omitted.add(jobnum);
212 }
213 }
214 }
215 scanner.close();
216 } // Assume argument is a job generator constructor expression.
217 else {
218 generatorExpression = args[0];
219 }
220
221 // Create job generator.
222 if (generatorExpression == null) {
223 stderr.printf("Runner: No job generator in checkpoint file %s%n",
224 args[0]);
225 } else {
226 try {
227 generator = (JobGenerator) Instance.newInstance(generatorExpression);
228 stdout.printf("*** Generator %s%n", generatorExpression);
229 generator.omit(omitted);
230 } catch (Throwable exc) {
231 stderr.printf("Runner: Could not create job generator %s%n",
232 generatorExpression);
233 exc.printStackTrace(stderr);
234 }
235 }
236 }
237
238 // Abort every process if job generator was not created.
239 BooleanItemBuf buf = BooleanBuf.buffer(generator != null);
240 world.broadcast(team.masterRank(), buf);
241 if (!buf.item) {
242 System.exit(1);
243 }
244
245 // Generate and run jobs.
246 team.execute(new WorkerRegion() {
247 public void run() throws Exception {
248 execute(generator, new WorkerIteration<Job>() {
249 public void sendTaskInput(Job job, Comm comm, int wRank, int tag) {
250 stdout.printf("*** Job %d started %s %s%n",
251 job.getJobNumber(),
252 new Date(),
253 job.getDescription());
254 }
255
256 public void run(Job job) {
257 job.run();
258 }
259
260 public void receiveTaskOutput(Job job, Comm comm, int wRank, int tag) {
261 stdout.printf("*** Job %d finished %s%n",
262 job.getJobNumber(),
263 new Date());
264 }
265 });
266 }
267 });
268
269 if (rank == team.masterRank()) {
270 stdout.printf("*** All jobs finished%n");
271 }
272 }
273
274 // Hidden operations.
275 /**
276 * Print a usage message and exit.
277 */
278 private static void usage() {
279 stderr.println("Usage: java edu.rit.pj.job.Runner {<generator>|<file>}");
280 stderr.println("<generator> = Job generator constructor expression");
281 stderr.println("<file> = Checkpoint file name");
282 System.exit(1);
283 }
284
285 }