Fixing the problem of ts clients being killed. The queue should be reusable
now.
--- a/buglist.bug Sun Nov 09 12:53:35 2008 +0100
+++ b/buglist.bug Sun Nov 09 22:25:49 2008 +0100
@@ -1,7 +1,7 @@
-24
+25
0 1 sleep recv() may return ECONNRESET \nI don't know why. It happened sometimes, when the client didn't send any\nENDJOB message.\nIt should be investigated.\n\nHypothesis:\n- maybe recv() returns ECONNRESET if the other side process has died but not\n closed()?\n simple tests seem to show that it's not true.
18 1 open Making a different TS_SAVELIST name on each socket/date
-19 10 open Killing a child ts blocked the queue ts creates the job in a new Process Group. You can send signals\nto the whole Process Group using a negative number for kill, as\nin: "kill -- -`ts -p`" (the first double hyphen is for sending\nthe default TERM signal, and not mixing the ID with a signal\nnumber).\n\nI finally tried this and it works. Once I accidently killed the\ncorresponding ts process instead and this hung the whole queue:\n\nts(3427)---grandpa(3428)-+-father(3429)-+-son(3430)---sleep(3431)\n | `-son(3435)---sleep(3436)\n `-father(3432)-+-son(3433)---sleep(3434)\n `-son(3437)---sleep(3438)\n\n$ kill -- -3427 #should have been 3428\n$ ts\n\nID State Output E-Level Times(r/u/s) Command [run=1/1]\n1 queued (file) grandpa\n2 queued (file) grandpa\n\nKilling the correct pid 3428 didn't help ts to recover. A 'ts -C'\ndid exactly nothing. Finally 'ts -K' allowed to use ts again.\n\nJust as a note\nThomas
20 1 open Log the output of TS_ONFINISH, for the case it gives an error I actually used this for debugging:\n\nexec >> $HOME/${0##*/}.out 2>&1\ndate; echo params: "$@"; echo\nset -xv\n\nand found that the script is run, but that the server worked\ndifferently from what I expected: My script checks the output\nof ts to be sure that it is run via ts and not via cmdline.\n\nAs the script gets some information as parameters, I checked\nwhether these are in the output of ts. What I expected was\nthat ts would show a line like\n\n0 finished /tmp/ts-out.l8jhbI 0 10.05/0.00/0.00 sleep 10\n\nNow I understand that the (just finished) job is shown as\n\n0 running /tmp/ts-out.l8jhbI sleep 10\n\nin this very moment. I adapted my script and now it works. :-)\n\nBTW: Why don't you just log stdout/stderr of $TS_ONFINISH?
22 1 open Add more information to TS_SAVELIST (from ts -i, for example)
23 5 open "ts -S" could return the number of slots set.
+19 10 fixed Killing a child ts blocked the queue ts creates the job in a new Process Group. You can send signals\nto the whole Process Group using a negative number for kill, as\nin: "kill -- -`ts -p`" (the first double hyphen is for sending\nthe default TERM signal, and not mixing the ID with a signal\nnumber).\n\nI finally tried this and it works. Once I accidently killed the\ncorresponding ts process instead and this hung the whole queue:\n\nts(3427)---grandpa(3428)-+-father(3429)-+-son(3430)---sleep(3431)\n | `-son(3435)---sleep(3436)\n `-father(3432)-+-son(3433)---sleep(3434)\n `-son(3437)---sleep(3438)\n\n$ kill -- -3427 #should have been 3428\n$ ts\n\nID State Output E-Level Times(r/u/s) Command [run=1/1]\n1 queued (file) grandpa\n2 queued (file) grandpa\n\nKilling the correct pid 3428 didn't help ts to recover. A 'ts -C'\ndid exactly nothing. Finally 'ts -K' allowed to use ts again.\n\nJust as a note\nThomas
--- a/jobs.c Sun Nov 09 12:53:35 2008 +0100
+++ b/jobs.c Sun Nov 09 22:25:49 2008 +0100
@@ -487,6 +487,18 @@
return;
}
+int job_is_running(int jobid)
+{
+ struct Job *p;
+
+ p = findjob(jobid);
+ if (p == 0)
+ return 0;
+ if (p->state == RUNNING)
+ return 1;
+ return 0;
+}
+
void job_finished(const struct Result *result, int jobid)
{
struct Job *p;
@@ -1238,7 +1250,7 @@
p->output_filename ? p->output_filename : "NULL");
fprintf(out, " store_output %i\n", p->store_output);
fprintf(out, " pid %i\n", p->pid);
- fprintf(out, " should_keep_finished %i\n", p->pid);
+ fprintf(out, " should_keep_finished %i\n", p->should_keep_finished);
}
void dump_jobs_struct(FILE *out)
--- a/main.h Sun Nov 09 12:53:35 2008 +0100
+++ b/main.h Sun Nov 09 22:25:49 2008 +0100
@@ -211,6 +211,7 @@
void s_job_info(int s, int jobid);
void s_send_runjob(int s, int jobid);
void s_set_max_slots(int new_max_slots);
+int job_is_running(int jobid);
/* server.c */
void server_main(int notify_fd, char *_path);
--- a/server.c Sun Nov 09 12:53:35 2008 +0100
+++ b/server.c Sun Nov 09 22:25:49 2008 +0100
@@ -265,6 +265,41 @@
nconnections--;
}
+static void
+clean_after_client_disappeared(int socket, int index)
+{
+ /* Act as if the job ended. */
+ int jobid = client_cs[index].jobid;
+ if (job_is_running(jobid))
+ {
+ struct Result r;
+
+ r.errorlevel = -1;
+ r.died_by_signal = 1;
+ r.signal = SIGKILL;
+ r.user_ms = 0;
+ r.system_ms = 0;
+ r.real_ms = 0;
+ r.skipped = 0;
+
+ warning("JobID %i quit while running.", jobid);
+ job_finished(&r, jobid);
+ /* For the dependencies */
+ check_notify_list(jobid);
+ /* We don't want this connection to do anything
+ * more related to the jobid, secially on remove_connection
+ * when we receive the EOC. */
+ client_cs[index].hasjob = 0;
+ }
+
+ close(socket);
+ /* TODO: if the client dies while its job is running, someone may prefer
+ * to note the job as finished with an error. Now we simply remove
+ * it from the queue. */
+ remove_connection(index);
+ /* It will not fail, even if the index is not a notification */
+ s_remove_notification(socket);
+}
static enum Break
client_read(int index)
@@ -280,21 +315,12 @@
if (res == -1)
{
warning("client recv failed");
- close(s);
- remove_connection(index);
- /* It will not fail, even if the index is not a notification */
- s_remove_notification(s);
+ clean_after_client_disappeared(s, index);
return NOBREAK;
}
else if (res == 0)
{
- close(s);
- /* TODO: if the client dies while its job is running, someone may prefer
- * to note the job as finished with an error. Now we simply remove
- * it from the queue. */
- remove_connection(index);
- /* It will not fail, even if the index is not a notification */
- s_remove_notification(s);
+ clean_after_client_disappeared(s, index);
return NOBREAK;
}