Fixing the problem of ts clients being killed. The queue should be reusable
authorviric@mandarina
Sun, 09 Nov 2008 22:25:49 +0100
changeset 252 f0af96f58947
parent 251 5e0802df5788
child 253 e9b153f4ae40
Fixing the problem of ts clients being killed. The queue should be reusable now.
buglist.bug
jobs.c
main.h
server.c
--- a/buglist.bug	Sun Nov 09 12:53:35 2008 +0100
+++ b/buglist.bug	Sun Nov 09 22:25:49 2008 +0100
@@ -1,7 +1,7 @@
-24
+25
 0	1	sleep	recv() may return ECONNRESET	\nI don't know why. It happened sometimes, when the client didn't send any\nENDJOB message.\nIt should be investigated.\n\nHypothesis:\n- maybe recv() returns ECONNRESET if the other side process has died but not\n  closed()?\n  simple tests seem to show that it's not true.
 18	1	open	Making a different TS_SAVELIST name on each socket/date	
-19	10	open	Killing a child ts blocked the queue	ts creates the job in a new Process Group. You can send signals\nto the whole Process Group using a negative number for kill, as\nin: "kill -- -`ts -p`"  (the first double hyphen is for sending\nthe default TERM signal, and not mixing the ID with a signal\nnumber).\n\nI finally tried this and it works. Once I accidently killed the\ncorresponding ts process instead and this hung the whole queue:\n\nts(3427)---grandpa(3428)-+-father(3429)-+-son(3430)---sleep(3431)\n                        |               `-son(3435)---sleep(3436)\n                         `-father(3432)-+-son(3433)---sleep(3434)\n                                         `-son(3437)---sleep(3438)\n\n$ kill -- -3427 #should have been 3428\n$ ts\n\nID  State   Output  E-Level  Times(r/u/s)  Command [run=1/1]\n1   queued  (file)                         grandpa\n2   queued  (file)                         grandpa\n\nKilling the correct pid 3428 didn't help ts to recover. A 'ts -C'\ndid exactly nothing. Finally 'ts -K' allowed to use ts again.\n\nJust as a note\nThomas
 20	1	open	Log the output of TS_ONFINISH, for the case it gives an error	I actually used this for debugging:\n\nexec >> $HOME/${0##*/}.out 2>&1\ndate; echo params: "$@"; echo\nset -xv\n\nand found that the script is run, but that the server worked\ndifferently from what I expected: My script checks the output\nof ts to be sure that it is run via ts and not via cmdline.\n\nAs the script gets some information as parameters, I checked\nwhether these are in the output of ts. What I expected was\nthat ts would show a line like\n\n0    finished   /tmp/ts-out.l8jhbI   0        10.05/0.00/0.00 sleep 10\n\nNow I understand that the (just finished) job is shown as\n\n0    running    /tmp/ts-out.l8jhbI                           sleep 10\n\nin this very moment. I adapted my script and now it works. :-)\n\nBTW: Why don't you just log stdout/stderr of $TS_ONFINISH?
 22	1	open	Add more information to TS_SAVELIST (from ts -i, for example)	
 23	5	open	"ts -S" could return the number of slots set.	
+19	10	fixed	Killing a child ts blocked the queue	ts creates the job in a new Process Group. You can send signals\nto the whole Process Group using a negative number for kill, as\nin: "kill -- -`ts -p`"  (the first double hyphen is for sending\nthe default TERM signal, and not mixing the ID with a signal\nnumber).\n\nI finally tried this and it works. Once I accidently killed the\ncorresponding ts process instead and this hung the whole queue:\n\nts(3427)---grandpa(3428)-+-father(3429)-+-son(3430)---sleep(3431)\n                        |               `-son(3435)---sleep(3436)\n                         `-father(3432)-+-son(3433)---sleep(3434)\n                                         `-son(3437)---sleep(3438)\n\n$ kill -- -3427 #should have been 3428\n$ ts\n\nID  State   Output  E-Level  Times(r/u/s)  Command [run=1/1]\n1   queued  (file)                         grandpa\n2   queued  (file)                         grandpa\n\nKilling the correct pid 3428 didn't help ts to recover. A 'ts -C'\ndid exactly nothing. Finally 'ts -K' allowed to use ts again.\n\nJust as a note\nThomas
--- a/jobs.c	Sun Nov 09 12:53:35 2008 +0100
+++ b/jobs.c	Sun Nov 09 22:25:49 2008 +0100
@@ -487,6 +487,18 @@
     return;
 }
 
+int job_is_running(int jobid)
+{
+    struct Job *p;
+
+    p = findjob(jobid);
+    if (p == 0)
+        return 0;
+    if (p->state == RUNNING)
+        return 1;
+    return 0;
+}
+
 void job_finished(const struct Result *result, int jobid)
 {
     struct Job *p;
@@ -1238,7 +1250,7 @@
             p->output_filename ? p->output_filename : "NULL");
     fprintf(out, "    store_output %i\n", p->store_output);
     fprintf(out, "    pid %i\n", p->pid);
-    fprintf(out, "    should_keep_finished %i\n", p->pid);
+    fprintf(out, "    should_keep_finished %i\n", p->should_keep_finished);
 }
 
 void dump_jobs_struct(FILE *out)
--- a/main.h	Sun Nov 09 12:53:35 2008 +0100
+++ b/main.h	Sun Nov 09 22:25:49 2008 +0100
@@ -211,6 +211,7 @@
 void s_job_info(int s, int jobid);
 void s_send_runjob(int s, int jobid);
 void s_set_max_slots(int new_max_slots);
+int job_is_running(int jobid);
 
 /* server.c */
 void server_main(int notify_fd, char *_path);
--- a/server.c	Sun Nov 09 12:53:35 2008 +0100
+++ b/server.c	Sun Nov 09 22:25:49 2008 +0100
@@ -265,6 +265,41 @@
     nconnections--;
 }
 
+static void
+clean_after_client_disappeared(int socket, int index)
+{
+    /* Act as if the job ended. */
+    int jobid = client_cs[index].jobid;
+    if (job_is_running(jobid))
+    {
+        struct Result r;
+
+        r.errorlevel = -1;
+        r.died_by_signal = 1;
+        r.signal = SIGKILL;
+        r.user_ms = 0;
+        r.system_ms = 0;
+        r.real_ms = 0;
+        r.skipped = 0;
+
+        warning("JobID %i quit while running.", jobid);
+        job_finished(&r, jobid);
+        /* For the dependencies */
+        check_notify_list(jobid);
+        /* We don't want this connection to do anything
+         * more related to the jobid, secially on remove_connection
+         * when we receive the EOC. */
+        client_cs[index].hasjob = 0;
+    }
+
+    close(socket);
+    /* TODO: if the client dies while its job is running, someone may prefer
+     * to note the job as finished with an error. Now we simply remove
+     * it from the queue. */
+    remove_connection(index);
+    /* It will not fail, even if the index is not a notification */
+    s_remove_notification(socket);
+}
 
 static enum Break
     client_read(int index)
@@ -280,21 +315,12 @@
     if (res == -1)
     {
         warning("client recv failed");
-        close(s);
-        remove_connection(index);
-        /* It will not fail, even if the index is not a notification */
-        s_remove_notification(s);
+        clean_after_client_disappeared(s, index);
         return NOBREAK;
     }
     else if (res == 0)
     {
-        close(s);
-        /* TODO: if the client dies while its job is running, someone may prefer
-         * to note the job as finished with an error. Now we simply remove
-         * it from the queue. */
-        remove_connection(index);
-        /* It will not fail, even if the index is not a notification */
-        s_remove_notification(s);
+        clean_after_client_disappeared(s, index);
         return NOBREAK;
     }