I have the following query, which uses multiple joins with max=0. It is not returning all of the results; I think the result set becomes too large and the search is unable to handle the load. How can I best optimize my query?

index=syslog (process=*epilog* "*slurm-epilog: START user*") OR (process=*prolog* "*slurm-prolog: END user*") host IN (preo*) timeformat="%Y-%m-%dT%H:%M:%S.%6N"
``` Purpose: correlate NVIDIA Xid kernel errors with the Slurm job that was running on the same host when the error occurred, then enrich each hit with job metadata from the slurm-jobs index. ```
``` Pull the leading timestamp (up to, but not including, the trailing -HH:MM offset), user and job id out of slurm-epilog START events. ```
| rex field=_raw "(?<epilog_start>[^ ]+)\-\d{2}\:\d{2}.*slurm\-epilog\:\s*START\suser\=(?<user>[^\s]+)\sjob\=(?<job_id>[^ ]+)"
``` Same extraction for slurm-prolog END events. ```
| rex field=_raw "(?<prolog_end>[^ ]+)\-\d{2}\:\d{2}.*slurm\-prolog\:\s*END\suser\=(?<user>[^\s]+)\sjob\=(?<job_id>[^ ]+)"
``` Collapse to one row per host/user/job carrying the prolog-end and epilog-start timestamps. ```
| stats values(epilog_start) as epilog_start values(prolog_end) as prolog_end first(_time) as _time by host user job_id
``` Keep only jobs for which a prolog END was seen. ```
| search prolog_end!=""
| search user=*
| search job_id=*
``` Remember the search-time clock; it is the fallback job end for jobs with no epilog and no later reboot. ```
| eval current_time = now()
``` Attach every "Linux version" boot message seen on the same host (max=0 joins all matching subsearch rows, type=left keeps jobs with no reboot). ```
``` NOTE(review): join subsearches are capped by limits.conf (result count and runtime); a truncated subsearch silently loses matches - verify the limits in your deployment. ```
``` NOTE(review): the base search filters host IN (preo*) while the subsearches use preos* - confirm which pattern is intended. ```
| join host max=0 type=left
[
search index=syslog "Linux version" host IN (preos*)
| rex field=_raw "(?<reboot_time>[^ ]+)\-\d{2}\:\d{2}.*kernel:\s.*Linux\sversion"
| stats count by reboot_time host
]
``` Convert the extracted timestamp strings to epoch seconds so they can be subtracted and compared. ```
| convert timeformat="%Y-%m-%dT%H:%M:%S.%6N" mktime(prolog_end) as job_start
| convert timeformat="%Y-%m-%dT%H:%M:%S.%6N" mktime(epilog_start) as epilog_start
| convert timeformat="%Y-%m-%dT%H:%M:%S.%6N" mktime(reboot_time) as reboot_time
``` Seconds from job start to the reboot; reboots that happened before the job started are blanked out. ```
| eval diff_reboot = reboot_time - job_start
| eval reboot_time = if(diff_reboot<0, "", reboot_time)
| eval diff_reboot = if(diff_reboot<0, "", diff_reboot)
``` Job end = epilog start when present, otherwise a reboot after job start, otherwise the search-time clock. ```
| eval job_end = if(epilog_start!="",epilog_start,if(diff_reboot>0,reboot_time,current_time))
``` Rows without a usable reboot get a large sentinel so they sort after rows with a real reboot delta. ```
| eval diff_reboot = if(diff_reboot!="",diff_reboot,10000000000)
``` Order each job's rows by ascending reboot delta so the dedup below keeps the nearest reboot after job start. ```
``` "sort 0" is required: without an explicit count, sort returns only the first 10,000 results and silently drops the remaining jobs. ```
| sort 0 host user job_id prolog_end diff_reboot
| dedup host user job_id prolog_end
``` Attach every NVRM Xid kernel message seen on the same host, with its parsed PCI bus id, error code, pid, process name and message text. ```
| join host max=0 type=left
[
search index=syslog (("NVRM: Xid*") process=kernel) host IN (preos*)
| rex field=_raw "(?<kernel_xid>[^ ]+)\-\d{2}\:\d{2}.*NVRM\:\sXid\s*\(PCI\:(?<PCIe_Bus_Id>[^ ]+)\)\:\s*(?<Error_Code>[^ ]+)\,\spid\=(?<pid>[^ ]+)\,\s*name\=(?<name>[^ ]+)\,\s(?<Log_Message>.*)"
| stats count by host kernel_xid PCIe_Bus_Id pid name Error_Code Log_Message
``` NOTE(review): "***" is all wildcards, so this presumably matches every row that has an Error_Code - confirm whether a specific code was intended. ```
| search Error_Code="***"
]
``` Drop jobs whose host produced no Xid event. ```
| search kernel_xid!=""
| convert timeformat="%Y-%m-%dT%H:%M:%S.%6N" mktime(kernel_xid) as xid_time
``` Keep only Xid events that fall strictly between job start and job end. ```
| eval diff = xid_time - job_start
| eval diff2 = job_end - xid_time
| search diff>0 AND diff2>0
``` Format epoch values for display; a job_end that came from the search-time fallback is shown as N/A. ```
| eval job_start = strftime(job_start,"%Y-%m-%d %H:%M:%S.%3N")
| eval job_end = if(job_end==current_time,"N/A",strftime(job_end,"%Y-%m-%d %H:%M:%S.%3N"))
| eval xid_time = strftime(xid_time,"%Y-%m-%d %H:%M:%S.%3N")
| eval current_time = strftime(current_time,"%Y-%m-%d %H:%M:%S.%3N")
| eval reboot_time = strftime(reboot_time,"%Y-%m-%d %H:%M:%S.%3N")
``` Enrich with job metadata from the slurm-jobs index, matched on user and job_id. ```
| join user job_id type=left
[ search index="slurm-jobs"
| stats count by job_id job_name user nodelist time_start time_end state submit_line
]
``` NOTE(review): _time, current_time and reboot_time are not by-fields of the final stats, so they do not appear in its output - confirm whether they are still needed. ```
| eval _time=strptime(xid_time,"%Y-%m-%d %H:%M:%S.%3N")
``` Final table: one row per distinct job/Xid combination. ```
``` NOTE(review): stats omits rows in which any by-field is null, so rows without a slurm-jobs match are presumably discarded here despite type=left - confirm, or fillnull the job fields first. ```
``` NOTE(review): the "dedup host" that follows this stats keeps only the first row per host, discarding every other job/Xid row for that host - confirm this is intended. ```
| stats count by host user job_id job_start job_end xid_time Error_Code PCIe_Bus_Id pid name Log_Message job_name nodelist state submit_line
| dedup host

Below are the sample log events.

Events from the first search:
2022-11-08T14:59:53.134550-08:00 preos slurm-epilog: START user=abc job=62112
2022-11-08T14:58:25.101203-08:00 preos slurm-prolog: END user=abc job=62112

Events from the subsearch of the first join:
2022-11-09T12:49:51.395174-08:00 preos kernel: [ 0.000000] Linux version hjhc (buildd@lcy02-amd64-032) (gcc (Ubuntu 11.2.0-19ubuntu1) 11.2.0, GNU ld (GNU Binutils for Ubuntu) 2.38) #58-Ubuntu SMP Thu Oct 13 08:03:55 UTC 2022 (Ubuntu 5.15.0-52.58-generic 5.15.60)

Events from the second join:
2022-11-09T12:35:15.422001-08:00 preos kernel: [ 99.166912] NVRM: Xid (PCI:00:2:00): 95, pid='<unknown>', name=<unknown>, Uncontained: FBHUB. RST: Yes, D-RST: No

Events from the last join:
2022-11-09 20:50:02.000, mod_time="1668027082", job_id="62112", job_name="rfm_nvcp_bek_cd_job", user="abc", account="admin", partition="vi2", qos="all", resv="YES", timelimit_minutes="10", work_dir="/sbatch/logs/2022-11-09T11-30-39/preos/viking-hbm2/builtin/nvcomp_benchmark_cascaded", submit_line="sbatch rfm_nvcomp_benchmark_cascaded_job.sh", time_submit="2022-11-09 12:11:13", time_eligible="2022-11-09 12:11:13", time_start="2022-11-09 12:50:02", time_end="2022-11-09 12:51:22", state="COMPLETED", exit_code="0", nodes_alloc="1", nodelist="preos0093", submit_to_start_time="00:38:49", eligible_to_start_time="00:38:49", start_to_end_time="00:01:20"

Thanks in advance.
... View more