Change from
Actual_End_Time > ExpectedEnd
to
(Actual_End_Time > ExpectedEnd +300)
to give 5 minutes (300 seconds) leeway.
The other one requires a major refactor... which the query does need...
Okay, here's the refactor. This should SCREAM in speed relative to the other one.
Obviously, it is air code, since we don't have your data to test with.
index=tomcat source ="/files0/nlhyp*" [Job] cronjob (Action=Starting OR Action=Finished)
| fields _time CronJobName Action
| rename COMMENT as "Find the latest _time for each action for each CronJobName, then roll them together"
| stats max(_time) as _time by CronJobName Action
| stats max(eval(case(Action="Starting",_time))) as Actual_Start_Epoch, max(eval(case(Action="Finished",_time))) as Actual_End_Epoch by CronJobName
| rename COMMENT as "Default the start time to end time if start is missing, then kill the end time if it's before the start time."
| eval Actual_Start_Epoch=coalesce(Actual_Start_Epoch, Actual_End_Epoch)
| eval Actual_End_Epoch=case(Actual_End_Epoch>=Actual_Start_Epoch, Actual_End_Epoch)
| rename COMMENT as "look up the time and frequency for each CronJobName"
| lookup CronJobLookup.csv CronJobName OUTPUT Job_Frequency_min Expected_Start_Time Expected_End_Time
| fields CronJobName Actual_Start_Epoch Actual_End_Epoch Job_Frequency_min Expected_Start_Time Expected_End_Time
| rename COMMENT as "Mark the records so we can delete each one after handling it one of two ways"
| eval rectype="detail1"
| rename COMMENT as "This section handles the once-only jobs"
| appendpipe [
| where rectype="detail1" AND Job_Frequency_min="Once"
| eval Current_Date=strftime(now(),"%Y-%m-%d")
| eval Expected_Start=Current_Date." ".Expected_Start_Time
| eval Expected_Start_Epoch=strptime(Expected_Start,"%Y-%m-%d %H:%M:%S")
| eval Expected_End=Current_Date." ".Expected_End_Time
| eval Expected_End_Epoch=strptime(Expected_End,"%Y-%m-%d %H:%M:%S")
| eval Actual_Start=strftime(Actual_Start_Epoch,"%Y-%m-%d %H:%M:%S")
| eval Actual_End=strftime(Actual_End_Epoch,"%Y-%m-%d %H:%M:%S")
| eval Status = case(
isnull(Actual_End_Epoch),"Not Complete",
Actual_End_Epoch <= Expected_End_Epoch + 300, "OK",
true(), "Over Run")
| table CronJobName Job_Frequency_min Expected_Start Expected_End Actual_Start Actual_End Status
| eval rectype="detailOnce"
]
| rename COMMENT as "This section handles the more-than-once jobs"
| rename COMMENT as "Assumes CronJobLookup.csv table has one record per CronJobName, w expected start and end time of one job. "
| appendpipe [
| where rectype="detail1" AND Job_Frequency_min!="Once"
| rename COMMENT as "Calculate the epoch time of the 'Anchor' version of the Expected Start Time "
| eval Current_Date=strftime(Actual_Start_Epoch,"%Y-%m-%d")
| eval Anchor_Start=CurrentDate." ".Expected_Start_Time
| eval Anchor_Start_Epoch=coalesce(strptime(Anchor_Start,"%Y-%m-%d %H:%M:%S")
| rename COMMENT as "Calculate the job frequency in seconds and figure out how many steps we are probably away from the anchor time"
| eval Job_Frequency_Sec=if(isnum(Job_Frequency_min),60*tonumber(Job_Frequency_min),600)
| eval steps_off=round( (Actual_Start_Epoch-Anchor_Start_Epoch) / Job_Frequency_Sec,0)
| eval secs_off=Job_Frequency_Sec*steps_off
| rename COMMENT as "Calculate the Expected Start Time given the timestep"
| eval Expected_Start_Epoch=Anchor_Start_Epoch+secs_off
| rename COMMENT as "Calculate the Expected End Time given the timestep"
| eval Expected_End=CurrentDate." ".Expected_End_Time
| eval Expected_End_Epoch=strptime(Expected_End,"%Y-%m-%d %H:%M:%S")+secs_off
| eval Expected_End=strftime(Expected_End_Epoch,"%Y-%m-%d %H:%M:%S")
| eval Status = case(
isnull(Actual_End_Epoch),"Not Complete",
Actual_End_Epoch <= Expected_End_Epoch +300, "OK",
true(), "Over Run")
| table CronJobName Job_Frequency_min Expected_Start Expected_End Actual_Start Actual_End Status
| eval rectype="detailMultiples"
]
| rename COMMENT as "Now we get rid of the pre-processing details"
| where rectype!="detail1"
| rename COMMENT as "and sort either by name or by start or end date (see note 1)"
| sort 0 CronJobName
NOTES -
1) The dates are now presented in ISO format, so that you can sort by date/time without converting back to epoch. If you get your techs and users used to looking at yyyy-mm-dd, then it will save you huge headaches in the long run, because ISO format cannot be misinterpreted. (No one uses yyyy-dd-mm.)
2) It should be easy enough to modify the second appendpipe based on different assumptions.
... View more