1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40 package edu.rit.pj.cluster;
41
42 import java.io.FileOutputStream;
43 import java.io.IOException;
44 import java.io.PrintStream;
45 import java.io.PrintWriter;
46 import java.net.InetSocketAddress;
47 import java.util.Date;
48 import java.util.HashMap;
49 import java.util.Iterator;
50 import java.util.LinkedList;
51 import java.util.List;
52 import java.util.Map;
53
54 import edu.rit.http.HttpRequest;
55 import edu.rit.http.HttpResponse;
56 import edu.rit.http.HttpServer;
57 import edu.rit.mp.Channel;
58 import edu.rit.mp.ChannelGroup;
59 import edu.rit.mp.ChannelGroupClosedException;
60 import edu.rit.mp.ConnectListener;
61 import edu.rit.mp.ObjectBuf;
62 import edu.rit.mp.Status;
63 import edu.rit.mp.buf.ObjectItemBuf;
64 import edu.rit.pj.Version;
65 import edu.rit.util.Logger;
66 import edu.rit.util.PrintStreamLogger;
67 import edu.rit.util.Timer;
68 import edu.rit.util.TimerTask;
69 import edu.rit.util.TimerThread;
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87 public class JobScheduler
88 implements JobSchedulerRef {
89
90
91
/** Cluster name, as reported by the configuration's getClusterName(). */
private String myClusterName;


/** Logger; the constructor points it at the configured log file. */
private Logger myLog;


/** Host and port the HTTP status server is bound to. */
private String myWebHost;
private int myWebPort;


/** Host and port the scheduler's channel group listens on. */
private String mySchedulerHost;
private int mySchedulerPort;


/** Host name handed to a job frontend together with its job number. */
private String myFrontendHost;


/** Maximum job time in seconds; the job timer is only started when this is greater than 0. */
private int myJobTime;


/** Backends looked up by their name. */
private Map<String, BackendInfo> myNameToBackendMap
        = new HashMap<String, BackendInfo>();


/** Backends in configuration order, and how many there are. */
private BackendInfo[] myBackendInfo;
private int myBackendCount;


/** Index into myBackendInfo where the next resource assignment scan begins. */
private int myNextBackendNumber = 0;


/** Number that will be given to the next newly created job. */
private int myNextJobNumber = 1;


/** Job records looked up by the frontend that owns them. */
private Map<JobFrontendRef, JobInfo> myFrontendToJobMap
        = new HashMap<JobFrontendRef, JobInfo>();


/** Jobs that have been started. */
private List<JobInfo> myRunningJobList
        = new LinkedList<JobInfo>();


/** Jobs queued and not yet started, in arrival order. */
private List<JobInfo> myWaitingJobList
        = new LinkedList<JobInfo>();


/** Timer thread that creates the lease-renew, lease-expire and job timers. */
private TimerThread myLeaseTimerThread;


/** Channel group used to exchange messages with job frontends. */
private ChannelGroup myChannelGroup;


/** HTTP server that serves the status pages. */
private HttpServer myHttpServer;


/** Accumulated job time in milliseconds, added to whenever a job is cleaned up. */
private long myTotalComputeTime;


/** Time (milliseconds since the epoch) at which this scheduler was constructed. */
private long myStartDateTime;
151
152
153
154
155
156
157
158
159
/**
 * Builds the scheduler from a configuration file: loads the settings, opens
 * the log, registers a shutdown hook, starts the lease timer thread, and
 * brings up the channel group and the HTTP status server.
 *
 * @param configfile configuration file name handed to {@code Configuration}
 * @throws IOException declared for I/O failures during setup
 */
private JobScheduler(String configfile)
        throws IOException {
    final long startTime = System.currentTimeMillis();
    myStartDateTime = startTime;

    // Load every setting from the configuration file.
    final Configuration settings = new Configuration(configfile);
    myClusterName = settings.getClusterName();
    // The log file is opened in append mode and the PrintStream auto-flushes.
    final FileOutputStream logFile = new FileOutputStream(settings.getLogFile(), true);
    myLog = new PrintStreamLogger(new PrintStream(logFile, true));
    myWebHost = settings.getWebHost();
    myWebPort = settings.getWebPort();
    mySchedulerHost = settings.getSchedulerHost();
    mySchedulerPort = settings.getSchedulerPort();
    myFrontendHost = settings.getFrontendHost();
    myJobTime = settings.getJobTime();
    myBackendCount = settings.getBackendCount();
    myBackendInfo = new BackendInfo[myBackendCount];
    for (int slot = 0; slot < myBackendCount; ++slot) {
        final BackendInfo node = settings.getBackendInfo(slot);
        myNameToBackendMap.put(node.name, node);
        myBackendInfo[slot] = node;
    }

    myLog.log(startTime, "Started " + Version.PJ_VERSION);

    // Run shutdown() when the JVM exits.
    final Thread shutdownHook = new Thread() {
        public void run() {
            shutdown();
        }
    };
    Runtime.getRuntime().addShutdownHook(shutdownHook);

    // Daemon thread that drives all lease and job timers.
    myLeaseTimerThread = new TimerThread();
    myLeaseTimerThread.setDaemon(true);
    myLeaseTimerThread.start();

    // Channel group on which job frontends connect; every incoming
    // connection gets a job record.
    final InetSocketAddress schedulerAddress
            = new InetSocketAddress(mySchedulerHost, mySchedulerPort);
    myChannelGroup = new ChannelGroup(schedulerAddress, myLog);
    myLog.log(startTime, "Job Scheduler at " + myChannelGroup.listenAddress());
    myChannelGroup.setConnectListener(new ConnectListener() {
        public void nearEndConnected(ChannelGroup theChannelGroup,
                                     Channel theChannel) {
            // Nothing to do for connections initiated from this side.
        }

        public void farEndConnected(ChannelGroup theChannelGroup,
                                    Channel theChannel) {
            createJob(theChannel);
        }
    });

    // HTTP server whose requests are delegated to processHttpRequest().
    final InetSocketAddress webAddress = new InetSocketAddress(myWebHost, myWebPort);
    myHttpServer = new HttpServer(webAddress, myLog) {
        protected void process(HttpRequest request,
                               HttpResponse response)
                throws IOException {
            processHttpRequest(request, response);
        }
    };
    myLog.log(startTime, "Web interface at " + myHttpServer.getAddress());

    // Record every configured backend in the log.
    for (int slot = 0; slot < myBackendInfo.length; ++slot) {
        final BackendInfo node = myBackendInfo[slot];
        final String plural = node.totalCpus == 1 ? "" : "s";
        myLog.log(startTime,
                "Backend " + node.name + " at " + node.host
                        + ", " + node.totalCpus
                        + " CPU" + plural);
    }

    // Only now begin accepting frontend connections.
    myChannelGroup.startListening();
}
238
239
240
241
242
243
244
245 private synchronized void createJob(Channel theChannel) {
246
247 JobFrontendRef frontend
248 = new JobFrontendProxy(myChannelGroup, theChannel);
249 theChannel.info(frontend);
250
251
252 JobInfo jobinfo = getJobInfo(frontend);
253
254
255 jobinfo.renewTimer.start(Constants.LEASE_RENEW_INTERVAL,
256 Constants.LEASE_RENEW_INTERVAL);
257 jobinfo.expireTimer.start(Constants.LEASE_EXPIRE_INTERVAL);
258 }
259
260
261
262
263 private void run() {
264 ObjectItemBuf<JobSchedulerMessage> buf
265 = ObjectBuf.buffer((JobSchedulerMessage) null);
266 Status status = null;
267 JobSchedulerMessage message = null;
268 JobFrontendRef frontend = null;
269
270 receiveloop:
271 for (;;) {
272
273 try {
274 status = myChannelGroup.receive(null, null, buf);
275 } catch (ChannelGroupClosedException exc) {
276
277 break receiveloop;
278 } catch (Throwable exc) {
279 myLog.log("Exception while receiving message", exc);
280 break receiveloop;
281 }
282 message = buf.item;
283
284
285 frontend = (JobFrontendRef) status.channel.info();
286
287
288 try {
289 message.invoke(this, frontend);
290 } catch (Throwable exc) {
291 myLog.log("Exception while processing message", exc);
292 }
293
294
295
296 buf.item = null;
297 status = null;
298 message = null;
299 frontend = null;
300 }
301 }
302
303
304
305
306
307
308
309
310 public synchronized void backendFailed(JobFrontendRef theJobFrontend,
311 String name)
312 throws IOException {
313 BackendInfo backendinfo = myNameToBackendMap.get(name);
314 if (backendinfo != null) {
315 long now = System.currentTimeMillis();
316 myLog.log(now, "Backend " + name + " failed");
317
318
319
320
321
322
323
324
325 }
326 }
327
328
329
330
331
332
333
334 public synchronized void cancelJob(JobFrontendRef theJobFrontend,
335 String errmsg)
336 throws IOException {
337 JobInfo jobinfo = getJobInfo(theJobFrontend);
338 doCancelJob(System.currentTimeMillis(), jobinfo, errmsg);
339 }
340
341
342
343
344
345
346
347 public synchronized void jobFinished(JobFrontendRef theJobFrontend)
348 throws IOException {
349 JobInfo jobinfo = getJobInfo(theJobFrontend);
350 doFinishJob(System.currentTimeMillis(), jobinfo);
351 }
352
353
354
355
356
357
358
359 public synchronized void renewLease(JobFrontendRef theJobFrontend)
360 throws IOException {
361 JobInfo jobinfo = getJobInfo(theJobFrontend);
362 jobinfo.expireTimer.start(Constants.LEASE_EXPIRE_INTERVAL);
363 }
364
365
366
367
368
369
370 public synchronized void reportComment(JobFrontendRef theJobFrontend,
371 int rank,
372 String comment) {
373 JobInfo jobinfo = getJobInfo(theJobFrontend);
374 jobinfo.comment[rank] = comment;
375 }
376
377
378
379
380
381
382
383 public synchronized void requestJob(JobFrontendRef theJobFrontend,
384 String username,
385 int Nn,
386 int Np,
387 int Nt)
388 throws IOException {
389 JobInfo jobinfo = getJobInfo(theJobFrontend);
390 long now = System.currentTimeMillis();
391 myLog.log(now,
392 "Job " + jobinfo.jobnum + " queued, username=" + username
393 + ", nn=" + Nn + ", np=" + Np + ", nt=" + Nt);
394
395
396 jobinfo.username = username;
397 jobinfo.Nn = Math.min(Nn, Np);
398 jobinfo.Np = Np;
399 jobinfo.Nt = Nt;
400 jobinfo.backend = new BackendInfo[Np];
401 jobinfo.cpus = new int[Np];
402 jobinfo.comment = new String[Np];
403 for (int i = 0; i < Np; ++i) {
404 jobinfo.comment[i] = "";
405 }
406
407
408 if (!enoughResourcesForJob(jobinfo.Nn, jobinfo.Np, jobinfo.Nt)) {
409 doCancelJobTooFewResources(now, jobinfo);
410 return;
411 }
412
413
414 myWaitingJobList.add(jobinfo);
415
416
417 theJobFrontend.assignJobNumber(this, jobinfo.jobnum, myFrontendHost);
418
419
420 assignResourcesToJobs(now);
421 }
422
423
424
425
/**
 * Close operation of this scheduler reference. The body is empty: this
 * implementation performs no action.
 */
public void close() {
}
428
429
430
431
432
433
434
435
436
437 private synchronized void renewTimeout(Timer theTimer,
438 JobFrontendRef theJobFrontend)
439 throws IOException {
440 if (theTimer.isTriggered()) {
441 theJobFrontend.renewLease(this);
442 }
443 }
444
445
446
447
448
449
450
451
452 private synchronized void expireTimeout(Timer theTimer,
453 JobFrontendRef theJobFrontend)
454 throws IOException {
455 if (theTimer.isTriggered()) {
456 JobInfo jobinfo = getJobInfo(theJobFrontend);
457 doCancelJob(System.currentTimeMillis(),
458 jobinfo,
459 "Job frontend lease expired");
460 }
461 }
462
463
464
465
466
467
468
469
470 private synchronized void jobTimeout(Timer theTimer,
471 JobFrontendRef theJobFrontend)
472 throws IOException {
473 if (theTimer.isTriggered()) {
474 JobInfo jobinfo = getJobInfo(theJobFrontend);
475 String errmsg
476 = "Maximum job time (" + myJobTime + " seconds) exceeded";
477 jobinfo.frontend.cancelJob(this, errmsg);
478 doCancelJob(System.currentTimeMillis(), jobinfo, errmsg);
479 }
480 }
481
482
483
484
485
486
487
488
489 private JobInfo getJobInfo(JobFrontendRef frontend) {
490 final JobFrontendRef fe = frontend;
491 JobInfo jobinfo = myFrontendToJobMap.get(frontend);
492 if (jobinfo == null) {
493 jobinfo = new JobInfo(myNextJobNumber++,
494 JobInfo.State.WAITING,
495 System.currentTimeMillis(),
496 null,
497 0,
498 0,
499 0,
500 0,
501 null,
502 null,
503 0,
504 fe,
505
506 myLeaseTimerThread.createTimer(new TimerTask() {
507 public void action(Timer theTimer) {
508 try {
509 renewTimeout(theTimer, fe);
510 } catch (Throwable exc) {
511 myLog.log(exc);
512 }
513 }
514 }),
515
516 myLeaseTimerThread.createTimer(new TimerTask() {
517 public void action(Timer theTimer) {
518 try {
519 expireTimeout(theTimer, fe);
520 } catch (Throwable exc) {
521 myLog.log(exc);
522 }
523 }
524 }),
525
526 myLeaseTimerThread.createTimer(new TimerTask() {
527 public void action(Timer theTimer) {
528 try {
529 jobTimeout(theTimer, fe);
530 } catch (Throwable exc) {
531 myLog.log(exc);
532 }
533 }
534 }));
535 myFrontendToJobMap.put(frontend, jobinfo);
536 }
537 return jobinfo;
538 }
539
540
541
542
543
544
545
546
547
548 private void doFinishJob(long now,
549 JobInfo jobinfo)
550 throws IOException {
551 myLog.log(now, "Job " + jobinfo.jobnum + " finished");
552 doCleanupJob(now, jobinfo);
553 }
554
555
556
557
558
559
560
561
562
563
564 private void doCancelJob(long now,
565 JobInfo jobinfo,
566 String errmsg)
567 throws IOException {
568 myLog.log(now, "Job " + jobinfo.jobnum + " canceled: " + errmsg);
569 doCleanupJob(now, jobinfo);
570 }
571
572
573
574
575
576
577
578
579
580 private void doCancelJobTooFewResources(long now,
581 JobInfo jobinfo)
582 throws IOException {
583 String errmsg;
584 if (jobinfo.Nt == 0) {
585 errmsg
586 = "Too few resources available to assign "
587 + jobinfo.Nn + " node" + (jobinfo.Nn == 1 ? "" : "s") + " and "
588 + jobinfo.Np + " process" + (jobinfo.Np == 1 ? "" : "es");
589 } else {
590 errmsg
591 = "Too few resources available to assign "
592 + jobinfo.Nn + " node" + (jobinfo.Nn == 1 ? "" : "s") + ", "
593 + jobinfo.Np + " process" + (jobinfo.Np == 1 ? "" : "es") + ", and "
594 + jobinfo.Nt + " CPU" + (jobinfo.Nt == 1 ? "" : "s") + " per process";
595 }
596 jobinfo.frontend.cancelJob(this, errmsg);
597 doCancelJob(now, jobinfo, errmsg);
598 }
599
600
601
602
603
604
605
606
607
608 private void doCleanupJob(long now,
609 JobInfo jobinfo)
610 throws IOException {
611
612 jobinfo.renewTimer.stop();
613 jobinfo.expireTimer.stop();
614 jobinfo.jobTimer.stop();
615
616
617 jobinfo.frontend.close();
618
619
620 myFrontendToJobMap.remove(jobinfo.frontend);
621 myRunningJobList.remove(jobinfo);
622 myWaitingJobList.remove(jobinfo);
623
624
625 for (int i = 0; i < jobinfo.count; ++i) {
626 BackendInfo backendinfo = jobinfo.backend[i];
627 if (backendinfo.state != BackendInfo.State.FAILED) {
628 backendinfo.state = BackendInfo.State.IDLE;
629 backendinfo.stateTime = now;
630 backendinfo.job = null;
631 }
632 }
633
634
635 myTotalComputeTime += (now - jobinfo.stateTime);
636
637
638 assignResourcesToJobs(now);
639 }
640
641
642
643
644
645
646
647
/**
 * Walks the waiting job list in order and assigns idle backends to jobs.
 * <P>
 * For each waiting job: if the cluster can no longer satisfy it, the job
 * is set aside for cancellation. Otherwise backends are scanned round-robin
 * starting at myNextBackendNumber and idle ones are reserved for the job.
 * When a job has all Np processes assigned it is moved to the running list
 * and its job timer is started (if a maximum job time is configured). The
 * scan stops at the first job that cannot be fully assigned, so later jobs
 * do not overtake it. Jobs set aside are cancelled after the scan.
 *
 * @param now timestamp applied to log entries and state changes
 * @throws IOException if contacting a frontend fails with an I/O error
 */
private void assignResourcesToJobs(long now)
        throws IOException {

    // Jobs to cancel are collected here and cancelled after the iteration,
    // because cancelling modifies the waiting list.
    List<JobInfo> cancelList = new LinkedList<JobInfo>();


    Iterator<JobInfo> iter = myWaitingJobList.iterator();
    jobLoop:
    while (iter.hasNext()) {
        JobInfo jobinfo = iter.next();


        // Backends may have failed since the job was queued; re-check that
        // the job can still be satisfied at all.
        if (!enoughResourcesForJob(jobinfo.Nn, jobinfo.Np, jobinfo.Nt)) {
            iter.remove();
            cancelList.add(jobinfo);
            continue jobLoop;
        }


        // Processes are spread over Nn nodes: every node gets Np/Nn, and
        // the first Np%Nn nodes get one extra.
        int Np_div_Nn = jobinfo.Np / jobinfo.Nn;
        int Np_rem_Nn = jobinfo.Np % jobinfo.Nn;


        // Scan all backends once, starting where the previous scan stopped.
        int be = myNextBackendNumber;
        do {

            // Number of processes the next node for this job must host.
            int Nproc = Np_div_Nn;
            if (jobinfo.nodeCount < Np_rem_Nn) {
                ++Nproc;
            }


            // NOTE(review): this check compares totalCpus with Nproc only,
            // whereas enoughResourcesForJob() also multiplies by Nt --
            // confirm whether ignoring Nt here is intended.
            BackendInfo backendinfo = myBackendInfo[be];
            if (backendinfo.state == BackendInfo.State.IDLE
                    && backendinfo.totalCpus >= Nproc) {

                // Reserve the backend for this job.
                backendinfo.state = BackendInfo.State.RESERVED;
                backendinfo.stateTime = now;
                backendinfo.job = jobinfo;


                // Used when Nt == 0: split the node's CPUs evenly over its
                // processes, the first few processes getting one extra.
                int Nt_div_Nproc = backendinfo.totalCpus / Nproc;
                int Nt_rem_Nproc = backendinfo.totalCpus % Nproc;


                for (int i = 0; i < Nproc; ++i) {

                    // CPUs for this process: the requested Nt, or an even
                    // share of the node when Nt is 0.
                    int Ncpus = jobinfo.Nt;
                    if (Ncpus == 0) {
                        Ncpus = Nt_div_Nproc;
                        if (i < Nt_rem_Nproc) {
                            ++Ncpus;
                        }
                    }


                    myLog.log(now,
                            "Job " + jobinfo.jobnum + " assigned "
                                    + backendinfo.name + ", rank=" + jobinfo.count
                                    + ", CPUs=" + Ncpus);


                    // Record the assignment; jobinfo.count is the rank of
                    // the process being assigned.
                    jobinfo.backend[jobinfo.count] = backendinfo;
                    jobinfo.cpus[jobinfo.count] = Ncpus;
                    ++jobinfo.count;


                    // Tell the frontend about the backend for this process.
                    jobinfo.frontend.assignBackend(this,
                            backendinfo.name,
                            backendinfo.host,
                            backendinfo.jvm,
                            backendinfo.classpath,
                            backendinfo.jvmflags,
                            backendinfo.shellCommand,
                            Ncpus);
                }


                ++jobinfo.nodeCount;
            }


            // Advance round-robin; stop after a full lap or once the job
            // has all its processes.
            be = (be + 1) % myBackendCount;
        } while (be != myNextBackendNumber && jobinfo.count < jobinfo.Np);
        myNextBackendNumber = be;


        if (jobinfo.count == jobinfo.Np) {

            myLog.log(now, "Job " + jobinfo.jobnum + " started");


            // Fully assigned: move the job from waiting to running.
            iter.remove();
            myRunningJobList.add(jobinfo);
            jobinfo.state = JobInfo.State.RUNNING;
            jobinfo.stateTime = now;


            for (BackendInfo backendinfo : jobinfo.backend) {
                backendinfo.state = BackendInfo.State.RUNNING;
                backendinfo.stateTime = now;
            }


            // Start the maximum job time timer (myJobTime is in seconds);
            // a non-positive value means no limit is enforced here.
            if (myJobTime > 0) {
                jobinfo.jobTimer.start(myJobTime * 1000L);
            }
        }

        // Not fully assigned: stop here so later jobs do not overtake this
        // one. Backends already reserved stay reserved for it.
        else {
            break jobLoop;
        }
    }


    // Cancel the jobs that can no longer be satisfied.
    for (JobInfo jobinfo : cancelList) {
        doCancelJobTooFewResources(now, jobinfo);
    }
}
770
771
772
773
774
775
776
777
778
779
780 private boolean enoughResourcesForJob(int Nn,
781 int Np,
782 int Nt) {
783
784 int Ppn = (Np + Nn - 1) / Nn;
785
786
787
788 if (Nt == 0) {
789 Nt = 1;
790 }
791
792
793 int nodeCount = 0;
794 for (BackendInfo backendinfo : myBackendInfo) {
795
796 if (backendinfo.state != BackendInfo.State.FAILED
797 &&
798 backendinfo.totalCpus >= Ppn * Nt) {
799
800 ++nodeCount;
801 }
802 }
803
804
805 return nodeCount >= Nn;
806 }
807
808
809
810
811
812
813
814
815
816 private void processHttpRequest(HttpRequest request,
817 HttpResponse response)
818 throws IOException {
819 long now = System.currentTimeMillis();
820
821
822 if (!request.isValid()) {
823 response.setStatusCode(HttpResponse.Status.STATUS_400_BAD_REQUEST);
824 PrintWriter out = response.getPrintWriter();
825 printStatusHtmlStart(out, now);
826 out.println("<P>");
827 out.println("400 Bad Request");
828 printStatusHtmlEnd(out);
829 }
830 else if (!request.getMethod().equals(HttpRequest.GET_METHOD)) {
831 response.setStatusCode(HttpResponse.Status.STATUS_501_NOT_IMPLEMENTED);
832 PrintWriter out = response.getPrintWriter();
833 printStatusHtmlStart(out, now);
834 out.println("<P>");
835 out.println("501 Not Implemented");
836 printStatusHtmlEnd(out);
837 }
838 else if (request.getUri().equals("/")
839 || request.getUri().equals("/?")) {
840 PrintWriter out = response.getPrintWriter();
841 printStatusHtmlStart(out, now);
842 printStatusHtmlBody(out, now);
843 printStatusHtmlEnd(out);
844 }
845 else if (request.getUri().equals("/debug")) {
846 PrintWriter out = response.getPrintWriter();
847 printDebugHtmlStart(out, now);
848 printDebugHtmlBody(out);
849 printStatusHtmlEnd(out);
850 }
851 else if (request.getUri().startsWith("/job/")) {
852 String jobString = request.getUri().substring(5);
853 try {
854 int jobNum = Integer.parseInt(jobString);
855 PrintWriter out = response.getPrintWriter();
856 printJobDetailHtmlStart(out, now, jobNum);
857 printJobDetailHtmlBody(out, now, jobNum);
858 printStatusHtmlEnd(out);
859 } catch (NumberFormatException exc) {
860 PrintWriter out = response.getPrintWriter();
861 printErrorHtmlStart(out);
862 out.printf("<P>Invalid job number \"%s\"</P>\n", jobString);
863 printErrorHtmlEnd(out);
864 }
865 }
866 else {
867 response.setStatusCode(HttpResponse.Status.STATUS_404_NOT_FOUND);
868 PrintWriter out = response.getPrintWriter();
869 printErrorHtmlStart(out);
870 out.println("<P>404 Not Found</P>");
871 printErrorHtmlEnd(out);
872 }
873
874
875 response.close();
876 }
877
878
879
880
881
882
883
884 private void printStatusHtmlStart(PrintWriter out,
885 long now) {
886 out.println("<HTML>");
887 out.println("<HEAD>");
888 out.print("<TITLE>");
889 out.print(myClusterName);
890 out.println("</TITLE>");
891 out.print("<META HTTP-EQUIV=\"refresh\" CONTENT=\"20;url=");
892 printWebInterfaceURL(out);
893 out.println("\">");
894 out.println("<STYLE TYPE=\"text/css\">");
895 out.println("<!--");
896 out.println("* {font-family: Arial, Helvetica, Sans-Serif;}");
897 out.println("body {font-size: small;}");
898 out.println("h1 {font-size: 140%; font-weight: bold;}");
899 out.println("table {font-size: 100%;}");
900 out.println("-->");
901 out.println("</STYLE>");
902 out.println("</HEAD>");
903 out.println("<BODY>");
904 out.print("<H1>");
905 out.print(myClusterName);
906 out.println("</H1>");
907 out.println("<P>");
908 out.print("<FORM ACTION=\"");
909 printWebInterfaceURL(out);
910 out.println("\" METHOD=\"get\">");
911 out.println("<TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>");
912 out.println("<TR>");
913 out.print("<TD ALIGN=\"left\" VALIGN=\"center\">");
914 out.print("<INPUT TYPE=\"submit\" VALUE=\"Refresh\">");
915 out.println("</TD>");
916 out.println("<TD WIDTH=20> </TD>");
917 out.print("<TD ALIGN=\"left\" VALIGN=\"center\">");
918 out.print(new Date(now));
919 out.print(" -- ");
920 out.print(Version.PJ_VERSION);
921 out.println("</TD>");
922 out.println("</TR>");
923 out.println("</TABLE>");
924 out.println("</FORM>");
925 }
926
927
928
929
930
931
932
933 private synchronized void printStatusHtmlBody(PrintWriter out,
934 long now) {
935 out.println("<P>");
936 out.println("<TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>");
937 out.println("<TR>");
938 out.println("<TD ALIGN=\"center\" VALIGN=\"top\">");
939
940 out.println("Nodes");
941 out.println("<TABLE BORDER=1 CELLPADDING=3 CELLSPACING=0>");
942 out.println("<TR>");
943 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">");
944
945 out.println("<TABLE BORDER=0 CELLPADDING=3 CELLSPACING=0>");
946 printBackendLabels(out);
947 int i = 0;
948 for (BackendInfo backend : myBackendInfo) {
949 printBackendInfo(out, now, backend, i);
950 ++i;
951 }
952 out.println("</TABLE>");
953
954 out.println("</TD>");
955 out.println("</TR>");
956 out.println("</TABLE>");
957
958 out.println("</TD>");
959 out.println("<TD WIDTH=40> </TD>");
960 out.println("<TD ALIGN=\"center\" VALIGN=\"top\">");
961
962 out.println("Jobs");
963 out.println("<TABLE BORDER=1 CELLPADDING=3 CELLSPACING=0>");
964 out.println("<TR>");
965 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">");
966
967 out.println("<TABLE BORDER=0 CELLPADDING=3 CELLSPACING=0>");
968 printJobLabels(out);
969 i = 0;
970 for (JobInfo job : myRunningJobList) {
971 printJobInfo(out, now, job, i);
972 ++i;
973 }
974 for (JobInfo job : myWaitingJobList) {
975 printJobInfo(out, now, job, i);
976 ++i;
977 }
978 out.println("</TABLE>");
979
980 out.println("</TD>");
981 out.println("</TR>");
982 out.println("</TABLE>");
983
984 printTotalComputeTime(out);
985 out.print("<BR>");
986 printJobCount(out);
987 out.println("<BR>Since " + new Date(myStartDateTime));
988
989 out.println("</TD>");
990 out.println("</TR>");
991 out.println("</TABLE>");
992 }
993
994
995
996
997
998
999 private void printJobCount(PrintWriter out) {
1000 if (myNextJobNumber == 2) {
1001 out.print("1 job");
1002 } else {
1003 out.print(myNextJobNumber - 1);
1004 out.print(" jobs");
1005 }
1006 out.println(" served");
1007 }
1008
1009
1010
1011
1012
1013
1014 private void printTotalComputeTime(PrintWriter out) {
1015 if (myTotalComputeTime < 1000000L) {
1016 out.print(myTotalComputeTime / 1000L);
1017 } else if (myTotalComputeTime < 1000000000L) {
1018 out.print("Over ");
1019 out.print(myTotalComputeTime / 1000000L);
1020 out.print(" thousand");
1021 } else if (myTotalComputeTime < 1000000000000L) {
1022 out.print("Over ");
1023 out.print(myTotalComputeTime / 1000000000L);
1024 out.print(" million");
1025 } else if (myTotalComputeTime < 1000000000000000L) {
1026 out.print("Over ");
1027 out.print(myTotalComputeTime / 1000000000000L);
1028 out.print(" billion");
1029 } else {
1030 out.print("Over ");
1031 out.print(myTotalComputeTime / 1000000000000000L);
1032 out.print(" trillion");
1033 }
1034 out.println(" CPU seconds served");
1035 }
1036
1037
1038
1039
1040
1041
1042 private void printStatusHtmlEnd(PrintWriter out) {
1043 out.println("<P>");
1044 out.println("<TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>");
1045 out.println("<TR>");
1046 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">");
1047 out.println("Job queue web interface: ");
1048 out.println("</TD>");
1049 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">");
1050 out.print("<A HREF=\"");
1051 printWebInterfaceURL(out);
1052 out.print("\">");
1053 printWebInterfaceURL(out);
1054 out.println("</A>");
1055 out.println("</TD>");
1056 out.println("</TR>");
1057 out.println("<TR>");
1058 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">");
1059 out.println("Powered by Parallel Java: ");
1060 out.println("</TD>");
1061 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">");
1062 out.println("<A HREF=\"http://www.cs.rit.edu/~ark/pj.shtml\">http://www.cs.rit.edu/~ark/pj.shtml</A>");
1063 out.println("</TD>");
1064 out.println("</TR>");
1065 out.println("<TR>");
1066 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">");
1067 out.println("Developed by Alan Kaminsky: ");
1068 out.println("</TD>");
1069 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">");
1070 out.println("<A HREF=\"http://www.cs.rit.edu/~ark/\">http://www.cs.rit.edu/~ark/</A>");
1071 out.println("</TD>");
1072 out.println("</TR>");
1073 out.println("</TABLE>");
1074 out.println("</BODY>");
1075 out.println("</HTML>");
1076 }
1077
1078
1079
1080
1081
1082
1083 private void printWebInterfaceURL(PrintWriter out) {
1084 out.printf("http://%s:%d/", myWebHost, myWebPort);
1085 }
1086
1087
1088
1089
1090
1091
1092
1093 private void printJobNumberURL(PrintWriter out,
1094 int jobNum) {
1095 out.printf("http://%s:%d/job/%d", myWebHost, myWebPort, jobNum);
1096 }
1097
1098
1099
1100
1101
1102
1103
1104 private void printJobNumberLink(PrintWriter out,
1105 int jobNum) {
1106 out.printf("<A HREF=\"http://%s:%d/job/%d\"> %d </A>",
1107 myWebHost, myWebPort, jobNum, jobNum);
1108 }
1109
1110
1111
1112
1113
1114
1115
1116
1117 private void printDeltaTime(PrintWriter out,
1118 long now,
1119 long then) {
1120 out.print((now - then + 500L) / 1000L);
1121 out.print(" sec");
1122 }
1123
1124
1125
1126
1127
1128
1129 private void printBackendLabels(PrintWriter out) {
1130 out.println("<TR BGCOLOR=\"#E8E8E8\">");
1131 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1132 out.print("<I>Node</I>");
1133 out.println("</TD>");
1134 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1135 out.print("<I>CPUs</I>");
1136 out.println("</TD>");
1137 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1138 out.print("<I>Status</I>");
1139 out.println("</TD>");
1140 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1141 out.print("<I>Job</I>");
1142 out.println("</TD>");
1143 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1144 out.print("<I>Time</I>");
1145 out.println("</TD>");
1146 out.println("</TR>");
1147 }
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157 private void printBackendInfo(PrintWriter out,
1158 long now,
1159 BackendInfo backend,
1160 int i) {
1161 out.print("<TR BGCOLOR=\"#");
1162 out.print(i % 2 == 0 ? "FFFFFF" : "E8E8E8");
1163 out.println("\">");
1164 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1165 out.print(backend.name);
1166 out.println("</TD>");
1167 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1168 out.print(backend.totalCpus);
1169 out.println("</TD>");
1170 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1171 if (backend.state == BackendInfo.State.FAILED) {
1172 out.print("<FONT COLOR=\"#FF0000\"><B>");
1173 out.print(backend.state);
1174 out.print("</B></FONT>");
1175 } else {
1176 out.print(backend.state);
1177 }
1178 out.println("</TD>");
1179 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1180 if (backend.job != null) {
1181 printJobNumberLink(out, backend.job.jobnum);
1182 } else {
1183 out.print(" ");
1184 }
1185 out.println("</TD>");
1186 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1187 if (backend.job != null) {
1188 printDeltaTime(out, now, backend.job.stateTime);
1189 } else {
1190 out.print(" ");
1191 }
1192 out.println("</TD>");
1193 out.println("</TR>");
1194 }
1195
1196
1197
1198
1199
1200
1201 private void printJobLabels(PrintWriter out) {
1202 out.println("<TR BGCOLOR=\"#E8E8E8\">");
1203 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1204 out.print("<I>Job</I>");
1205 out.println("</TD>");
1206 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1207 out.print("<I>User</I>");
1208 out.println("</TD>");
1209 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1210 out.print("<I>nn</I>");
1211 out.println("</TD>");
1212 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1213 out.print("<I>np</I>");
1214 out.println("</TD>");
1215 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1216 out.print("<I>nt</I>");
1217 out.println("</TD>");
1218 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1219 out.print("<I>Rank</I>");
1220 out.println("</TD>");
1221 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1222 out.print("<I>Node</I>");
1223 out.println("</TD>");
1224 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1225 out.print("<I>CPUs</I>");
1226 out.println("</TD>");
1227 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1228 out.print("<I>Status</I>");
1229 out.println("</TD>");
1230 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1231 out.print("<I>Time</I>");
1232 out.println("</TD>");
1233 out.println("</TR>");
1234 }
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244 private void printJobInfo(PrintWriter out,
1245 long now,
1246 JobInfo job,
1247 int i) {
1248 boolean first;
1249 out.print("<TR BGCOLOR=\"#");
1250 out.print(i % 2 == 0 ? "FFFFFF" : "E8E8E8");
1251 out.println("\">");
1252 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1253 printJobNumberLink(out, job.jobnum);
1254 out.println("</TD>");
1255 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1256 out.print(job.username);
1257 out.println("</TD>");
1258 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1259 out.print(job.Nn);
1260 out.println("</TD>");
1261 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1262 out.print(job.Np);
1263 out.println("</TD>");
1264 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1265 out.print(job.Nt == 0 ? "all" : "" + job.Nt);
1266 out.println("</TD>");
1267 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1268 if (job.count == 0) {
1269 out.print(" ");
1270 } else {
1271 for (int j = 0; j < job.count; ++j) {
1272 if (j > 0) {
1273 out.print("<BR>");
1274 }
1275 out.print(j);
1276 }
1277 }
1278 out.println("</TD>");
1279 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1280 if (job.count == 0) {
1281 out.print(" ");
1282 } else {
1283 for (int j = 0; j < job.count; ++j) {
1284 if (j > 0) {
1285 out.print("<BR>");
1286 }
1287 out.print(job.backend[j].name);
1288 }
1289 }
1290 out.println("</TD>");
1291 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1292 if (job.count == 0) {
1293 out.print(" ");
1294 } else {
1295 for (int j = 0; j < job.count; ++j) {
1296 if (j > 0) {
1297 out.print("<BR>");
1298 }
1299 out.print(job.cpus[j]);
1300 }
1301 }
1302 out.println("</TD>");
1303 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1304 out.print(job.state);
1305 out.println("</TD>");
1306 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1307 printDeltaTime(out, now, job.stateTime);
1308 out.println("</TD>");
1309 out.println("</TR>");
1310 }
1311
1312
1313
1314
1315
1316
1317 private void printDebugHtmlStart(PrintWriter out,
1318 long now) {
1319 out.println("<HTML>");
1320 out.println("<HEAD>");
1321 out.print("<TITLE>");
1322 out.print(myClusterName);
1323 out.println("</TITLE>");
1324 out.println("<STYLE TYPE=\"text/css\">");
1325 out.println("<!--");
1326 out.println("* {font-family: Arial, Helvetica, Sans-Serif;}");
1327 out.println("body {font-size: small;}");
1328 out.println("h1 {font-size: 140%; font-weight: bold;}");
1329 out.println("table {font-size: 100%;}");
1330 out.println("-->");
1331 out.println("</STYLE>");
1332 out.println("</HEAD>");
1333 out.println("<BODY>");
1334 out.print("<H1>");
1335 out.print(myClusterName);
1336 out.println("</H1>");
1337 out.println("<P>");
1338 out.print(new Date(now));
1339 out.print(" -- ");
1340 out.print(Version.PJ_VERSION);
1341 out.println("</P>");
1342 }
1343
1344
1345
1346
1347
1348
1349 private void printDebugHtmlBody(PrintWriter out) {
1350 out.println("<P>");
1351 out.println("<HR/>");
1352 out.println("<H3>Thread Dump</H3>");
1353 out.println("</P>");
1354 Map<Thread, StackTraceElement[]> traces = Thread.getAllStackTraces();
1355 for (Map.Entry<Thread, StackTraceElement[]> entry : traces.entrySet()) {
1356 Thread thread = entry.getKey();
1357 out.println("<P>");
1358 out.print("Name: ");
1359 out.print(thread.getName());
1360
1361
1362
1363 out.println(" ");
1364 out.print(" Daemon: ");
1365 out.print(thread.isDaemon() ? "yes" : "no");
1366 out.println(" ");
1367 out.print(" State: ");
1368 out.print(thread.getState());
1369 out.println(" ");
1370 out.print(" Priority: ");
1371 out.print(thread.getPriority());
1372 out.println(" ");
1373 out.print(" Thread Group: ");
1374 out.print(thread.getThreadGroup().getName());
1375 out.println();
1376 for (StackTraceElement element : entry.getValue()) {
1377 out.print("<BR/> ");
1378 out.println(element);
1379 }
1380 out.println("</P>");
1381 }
1382 out.println("<P>");
1383 out.println("<HR/>");
1384 out.println("</P>");
1385 }
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395 private void printJobDetailHtmlStart(PrintWriter out,
1396 long now,
1397 int jobNum) {
1398 out.println("<HTML>");
1399 out.println("<HEAD>");
1400 out.print("<TITLE>");
1401 out.print(myClusterName);
1402 out.println("</TITLE>");
1403 out.print("<META HTTP-EQUIV=\"refresh\" CONTENT=\"20;url=");
1404 printJobNumberURL(out, jobNum);
1405 out.println("\">");
1406 out.println("<STYLE TYPE=\"text/css\">");
1407 out.println("<!--");
1408 out.println("* {font-family: Arial, Helvetica, Sans-Serif;}");
1409 out.println("body {font-size: small;}");
1410 out.println("h1 {font-size: 140%; font-weight: bold;}");
1411 out.println("table {font-size: 100%;}");
1412 out.println("-->");
1413 out.println("</STYLE>");
1414 out.println("</HEAD>");
1415 out.println("<BODY>");
1416 out.print("<H1>");
1417 out.print(myClusterName);
1418 out.println("</H1>");
1419 out.println("<P>");
1420 out.print("<FORM ACTION=\"");
1421 printJobNumberURL(out, jobNum);
1422 out.println("\" METHOD=\"get\">");
1423 out.println("<TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>");
1424 out.println("<TR>");
1425 out.print("<TD ALIGN=\"left\" VALIGN=\"center\">");
1426 out.print("<INPUT TYPE=\"submit\" VALUE=\"Refresh\">");
1427 out.println("</TD>");
1428 out.println("<TD WIDTH=20> </TD>");
1429 out.print("<TD ALIGN=\"left\" VALIGN=\"center\">");
1430 out.print(new Date(now));
1431 out.print(" -- ");
1432 out.print(Version.PJ_VERSION);
1433 out.println("</TD>");
1434 out.println("</TR>");
1435 out.println("</TABLE>");
1436 out.println("</FORM>");
1437 }
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447 private synchronized void printJobDetailHtmlBody(PrintWriter out,
1448 long now,
1449 int jobNum) {
1450 JobInfo jobInfo = null;
1451
1452
1453 for (JobInfo job : myRunningJobList) {
1454 if (job.jobnum == jobNum) {
1455 jobInfo = job;
1456 break;
1457 }
1458 }
1459 if (jobInfo == null) {
1460 for (JobInfo job : myWaitingJobList) {
1461 if (job.jobnum == jobNum) {
1462 jobInfo = job;
1463 break;
1464 }
1465 }
1466 }
1467
1468 out.println("<P>");
1469 out.println("<TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>");
1470 out.println("<TR>");
1471 out.println("<TD ALIGN=\"left\" VALIGN=\"top\"><B>Job:</B></TD>");
1472 out.println("<TD WIDTH=10> </TD>");
1473 out.printf("<TD ALIGN=\"left\" VALIGN=\"top\"><B>%d</B></TD>",
1474 jobNum);
1475 out.println("</TR>");
1476 out.println("<TR>");
1477 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">User:</TD>");
1478 out.println("<TD WIDTH=10> </TD>");
1479 out.printf("<TD ALIGN=\"left\" VALIGN=\"top\">%s</TD>",
1480 jobInfo == null ? " " : jobInfo.username);
1481 out.println("</TR>");
1482 out.println("<TR>");
1483 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">Nodes (nn):</TD>");
1484 out.println("<TD WIDTH=10> </TD>");
1485 out.printf("<TD ALIGN=\"left\" VALIGN=\"top\">%s</TD>",
1486 jobInfo == null ? " " : "" + jobInfo.Nn);
1487 out.println("</TR>");
1488 out.println("<TR>");
1489 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">Processes (np):</TD>");
1490 out.println("<TD WIDTH=10> </TD>");
1491 out.printf("<TD ALIGN=\"left\" VALIGN=\"top\">%s</TD>",
1492 jobInfo == null ? " " : "" + jobInfo.Np);
1493 out.println("</TR>");
1494 out.println("<TR>");
1495 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">Threads (nt):</TD>");
1496 out.println("<TD WIDTH=10> </TD>");
1497 out.printf("<TD ALIGN=\"left\" VALIGN=\"top\">%s</TD>",
1498 jobInfo == null ? " " : jobInfo.Nt == 0 ? "All" : "" + jobInfo.Nt);
1499 out.println("</TR>");
1500 out.println("<TR>");
1501 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">Status:</TD>");
1502 out.println("<TD WIDTH=10> </TD>");
1503 out.printf("<TD ALIGN=\"left\" VALIGN=\"top\">%s</TD>",
1504 jobInfo == null ? "Not in queue" : jobInfo.state);
1505 out.println("</TR>");
1506 out.println("<TR>");
1507 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">Time:</TD>");
1508 out.println("<TD WIDTH=10> </TD>");
1509 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1510 if (jobInfo == null) {
1511 out.print(" ");
1512 } else {
1513 printDeltaTime(out, now, jobInfo.stateTime);
1514 }
1515 out.println("</TD>");
1516 out.println("</TR>");
1517 out.println("</TABLE>");
1518 out.println("</P>");
1519
1520 if (jobInfo == null || jobInfo.count == 0) {
1521 return;
1522 }
1523
1524 out.println("<P>");
1525 out.println("<TABLE BORDER=0 CELLPADDING=0 CELLSPACING=0>");
1526 out.println("<TR>");
1527 out.println("<TD ALIGN=\"center\" VALIGN=\"top\">");
1528
1529 out.println("Processes");
1530 out.println("<TABLE BORDER=1 CELLPADDING=3 CELLSPACING=0>");
1531 out.println("<TR>");
1532 out.println("<TD ALIGN=\"left\" VALIGN=\"top\">");
1533
1534 out.println("<TABLE BORDER=0 CELLPADDING=3 CELLSPACING=0>");
1535 printJobDetailProcessLabels(out);
1536 for (int i = 0; i < jobInfo.count; ++i) {
1537 printJobDetailProcessInfo(out, jobInfo, i);
1538 }
1539 out.println("</TABLE>");
1540
1541 out.println("</TD>");
1542 out.println("</TR>");
1543 out.println("</TABLE>");
1544
1545 out.println("</TD>");
1546 out.println("</TR>");
1547 out.println("</TABLE>");
1548 out.println("</P>");
1549 }
1550
1551
1552
1553
1554
1555
1556 private void printJobDetailProcessLabels(PrintWriter out) {
1557 out.println("<TR BGCOLOR=\"#E8E8E8\">");
1558 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1559 out.print("<I>Rank</I>");
1560 out.println("</TD>");
1561 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1562 out.print("<I>Node</I>");
1563 out.println("</TD>");
1564 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1565 out.print("<I>CPUs</I>");
1566 out.println("</TD>");
1567 out.print("<TD ALIGN=\"left\" VALIGN=\"top\">");
1568 out.print("<I>Comment</I>");
1569 out.println("</TD>");
1570 out.println("</TR>");
1571 }
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581 private void printJobDetailProcessInfo(PrintWriter out,
1582 JobInfo jobInfo,
1583 int rank) {
1584 out.printf("<TR BGCOLOR=\"#%s\">\n",
1585 rank % 2 == 0 ? "FFFFFF" : "E8E8E8");
1586 out.printf("<TD ALIGN=\"left\" VALIGN=\"top\">%d </TD>\n",
1587 rank);
1588 out.printf("<TD ALIGN=\"left\" VALIGN=\"top\">%s </TD>\n",
1589 jobInfo.backend[rank].name);
1590 out.printf("<TD ALIGN=\"left\" VALIGN=\"top\">%d </TD>\n",
1591 jobInfo.cpus[rank]);
1592 out.printf("<TD ALIGN=\"left\" VALIGN=\"top\">%s</TD>\n",
1593 jobInfo.comment[rank]);
1594 out.println("</TR>");
1595 }
1596
1597
1598
1599
1600
1601
1602 private void printErrorHtmlStart(PrintWriter out) {
1603 out.println("<HTML>");
1604 out.println("<HEAD>");
1605 out.print("<TITLE>");
1606 out.print(myClusterName);
1607 out.println("</TITLE>");
1608 out.println("<STYLE TYPE=\"text/css\">");
1609 out.println("<!--");
1610 out.println("* {font-family: Arial, Helvetica, Sans-Serif;}");
1611 out.println("body {font-size: small;}");
1612 out.println("h1 {font-size: 140%; font-weight: bold;}");
1613 out.println("table {font-size: 100%;}");
1614 out.println("-->");
1615 out.println("</STYLE>");
1616 out.println("</HEAD>");
1617 out.println("<BODY>");
1618 }
1619
1620
1621
1622
1623
1624
1625 private void printErrorHtmlEnd(PrintWriter out) {
1626 out.println("</BODY>");
1627 out.println("</HTML>");
1628 }
1629
1630
1631
1632
1633 private void shutdown() {
1634 if (myChannelGroup != null) {
1635 myChannelGroup.close();
1636 }
1637 if (myHttpServer != null) {
1638 try {
1639 myHttpServer.close();
1640 } catch (IOException ignored) {
1641 }
1642 }
1643 myLog.log("Stopped");
1644 }
1645
1646
1647
1648
1649
1650
1651
1652
1653 public static void main(String[] args)
1654 throws Exception {
1655 if (args.length != 1) {
1656 System.err.println("Usage: java edu.rit.pj.cluster.JobScheduler <configfile>");
1657 System.exit(1);
1658 }
1659
1660 JobScheduler scheduler = new JobScheduler(args[0]);
1661 scheduler.run();
1662 }
1663
1664 }