114114* After creating a new database or installing queue_job on an
115115 existing database, Odoo must be restarted for the runner to detect it.
116116
117- * When Odoo shuts down normally, it waits for running jobs to finish.
118- However, when the Odoo server crashes or is otherwise force-stopped,
119- running jobs are interrupted while the runner has no chance to know
120- they have been aborted. In such situations, jobs may remain in
121- ``started`` or ``enqueued`` state after the Odoo server is halted.
122- Since the runner has no way to know if they are actually running or
123- not, and does not know for sure if it is safe to restart the jobs,
124- it does not attempt to restart them automatically. Such stale jobs
125-   therefore fill the running queue and prevent other jobs from starting.
126- You must therefore requeue them manually, either from the Jobs view,
127- or by running the following SQL statement *before starting Odoo*:
128-
129- .. code-block:: sql
130-
131- update queue_job set state='pending' where state in ('started', 'enqueued')
132-
133117.. rubric:: Footnotes
134118
135119.. [1] From a security standpoint, it is safe to have an anonymous HTTP
155139from odoo .tools import config
156140
157141from . import queue_job_config
158- from .channels import ENQUEUED , NOT_DONE , PENDING , ChannelManager
142+ from .channels import ENQUEUED , NOT_DONE , ChannelManager
159143
160144SELECT_TIMEOUT = 60
161145ERROR_RECOVERY_DELAY = 5
@@ -207,35 +191,14 @@ def _connection_info_for(db_name):
207191
208192
209193def _async_http_get (scheme , host , port , user , password , db_name , job_uuid ):
210- # Method to set failed job (due to timeout, etc) as pending,
211- # to avoid keeping it as enqueued.
212- def set_job_pending ():
213- connection_info = _connection_info_for (db_name )
214- conn = psycopg2 .connect (** connection_info )
215- conn .set_isolation_level (ISOLATION_LEVEL_AUTOCOMMIT )
216- with closing (conn .cursor ()) as cr :
217- cr .execute (
218- "UPDATE queue_job SET state=%s, "
219- "date_enqueued=NULL, date_started=NULL "
220- "WHERE uuid=%s and state=%s "
221- "RETURNING uuid" ,
222- (PENDING , job_uuid , ENQUEUED ),
223- )
224- if cr .fetchone ():
225- _logger .warning (
226- "state of job %s was reset from %s to %s" ,
227- job_uuid ,
228- ENQUEUED ,
229- PENDING ,
230- )
231-
232194 # TODO: better way to HTTP GET asynchronously (grequest, ...)?
233195 # if this was python3 I would be doing this with
234196 # asyncio, aiohttp and aiopg
235197 def urlopen ():
236198 url = "{}://{}:{}/queue_job/runjob?db={}&job_uuid={}" .format (
237199 scheme , host , port , db_name , job_uuid
238200 )
201+ # pylint: disable=except-pass
239202 try :
240203 auth = None
241204 if user :
@@ -249,10 +212,10 @@ def urlopen():
249212 # for codes between 500 and 600
250213 response .raise_for_status ()
251214 except requests .Timeout :
252- set_job_pending ()
215+ # A timeout is a normal behaviour, it shouldn't be logged as an exception
216+ pass
253217 except Exception :
254218 _logger .exception ("exception in GET %s" , url )
255- set_job_pending ()
256219
257220 thread = threading .Thread (target = urlopen )
258221 thread .daemon = True
@@ -343,6 +306,93 @@ def set_job_enqueued(self, uuid):
343306 (ENQUEUED , uuid ),
344307 )
345308
    def _query_requeue_dead_jobs(self):
        """Return the SQL statement used to requeue dead jobs.

        A "dead" job is one in state ``enqueued`` or ``started`` whose row
        in ``queue_job_lock`` can be locked with ``FOR UPDATE SKIP LOCKED``:
        a live worker holds that row lock while executing, so an acquirable
        lock means the worker is gone.  Dead jobs are reset to ``pending``
        (incrementing ``retry`` for ``started`` ones), or marked ``failed``
        with a ``JobFoundDead`` error once ``retry > max_retries``.  The
        10-second buffer on ``date_enqueued`` avoids touching jobs that
        were enqueued but have not had a chance to start yet.  The query
        returns the ``uuid`` of every job it touched (see
        :meth:`requeue_dead_jobs`).

        :return: SQL string, takes no parameters.
        """
        # NOTE(review): all CASE expressions read the *pre-update* value of
        # ``retry`` (SQL UPDATE semantics), while ``retry`` itself is
        # incremented for 'started' jobs in the same statement.  A job
        # found dead exactly at retry == max_retries is therefore requeued
        # once more before failing on the next pass — confirm this
        # off-by-one is intended.
        return """
            UPDATE
                queue_job
            SET
                state=(
                    CASE
                        WHEN
                            max_retries IS NOT NULL AND
                            retry IS NOT NULL AND
                            retry>max_retries
                        THEN 'failed'
                        ELSE 'pending'
                    END),
                retry=(
                    CASE
                        WHEN state='started'
                        THEN COALESCE(retry,0)+1 ELSE retry
                    END),
                exc_name=(
                    CASE
                        WHEN
                            max_retries IS NOT NULL AND
                            retry IS NOT NULL AND
                            retry>max_retries
                        THEN 'JobFoundDead'
                        ELSE exc_name
                    END),
                exc_info=(
                    CASE
                        WHEN
                            max_retries IS NOT NULL AND
                            retry IS NOT NULL AND
                            retry>max_retries
                        THEN 'Job found dead after too many retries'
                        ELSE exc_info
                    END)
            WHERE
                id in (
                    SELECT
                        queue_job_id
                    FROM
                        queue_job_lock
                    WHERE
                        queue_job_id in (
                            SELECT
                                id
                            FROM
                                queue_job
                            WHERE
                                state IN ('enqueued','started')
                                AND date_enqueued <
                                    (now() AT TIME ZONE 'utc' - INTERVAL '10 sec')
                        )
                    FOR UPDATE SKIP LOCKED
                )
            RETURNING uuid
        """
368+ def requeue_dead_jobs (self ):
369+ """
370+ Set started and enqueued jobs but not locked to pending
371+
372+ A job is locked when it's being executed
373+ When a job is killed, it releases the lock
374+
375+ If the number of retries exceeds the number of max retries,
376+ the job is set as 'failed' with the error 'JobFoundDead'.
377+
378+ Adding a buffer on 'date_enqueued' to check
379+ that it has been enqueued for more than 10sec.
380+ This prevents from requeuing jobs before they are actually started.
381+
382+ When Odoo shuts down normally, it waits for running jobs to finish.
383+ However, when the Odoo server crashes or is otherwise force-stopped,
384+ running jobs are interrupted while the runner has no chance to know
385+ they have been aborted.
386+ """
387+
388+ with closing (self .conn .cursor ()) as cr :
389+ query = self ._query_requeue_dead_jobs ()
390+
391+ cr .execute (query )
392+
393+ for (uuid ,) in cr .fetchall ():
394+ _logger .warning ("Re-queued dead job with uuid: %s" , uuid )
395+
346396
347397class QueueJobRunner :
348398 def __init__ (
@@ -424,6 +474,11 @@ def initialize_databases(self):
424474 self .channel_manager .notify (db_name , * job_data )
425475 _logger .info ("queue job runner ready for db %s" , db_name )
426476
477+ def requeue_dead_jobs (self ):
478+ for db in self .db_by_name .values ():
479+ if db .has_queue_job :
480+ db .requeue_dead_jobs ()
481+
427482 def run_jobs (self ):
428483 now = _odoo_now ()
429484 for job in self .channel_manager .get_jobs_to_run (now ):
@@ -516,6 +571,7 @@ def run(self):
516571 _logger .info ("database connections ready" )
517572 # inner loop does the normal processing
518573 while not self ._stop :
574+ self .requeue_dead_jobs ()
519575 self .process_notifications ()
520576 self .run_jobs ()
521577 self .wait_notification ()
0 commit comments