I changed the script to follow the __next links and process the additional pages of logs. I don't have enough karma to attach a file, so I've pasted it below. Replace %SPLUNK_DIR%/etc/apps/TA-MS_O365_Reporting/bin/input_module_ms_o365_message_trace.py with the code below and it should retrieve all the messages for the polling period.
# encoding = utf-8
import os
import sys
import time
import datetime
import re
import requests
import json
import dateutil.parser
def validate_input(helper, definition):
    """Validate the input stanza configuration.

    No validation is performed: every configuration is accepted. Parameters
    of the stanza would be available through ``definition.parameters`` (for
    example ``definition.parameters.get('microsoft_office_365_account')``)
    should checks be added later.
    """
    return None
def _to_naive_utc(dt):
    """Return *dt* as a naive datetime expressed in UTC.

    Naive values are returned unchanged (they are taken to be UTC already);
    timezone-aware values are shifted by their UTC offset and stripped of
    their tzinfo.
    """
    offset = dt.utcoffset()
    if offset is None:
        return dt
    return (dt - offset).replace(tzinfo=None)


def get_start_date(helper, check_point_key):
    """Return the start of the next polling window as a naive UTC datetime.

    The value is resolved in this order:

    1. ``max_date`` stored in the check point under *check_point_key*;
    2. the ``start_date_time`` input argument;
    3. five days before the current UTC time.

    The result is always naive and in UTC because the caller appends a
    literal ``Z`` to its ISO representation and compares it with datetimes
    built by ``utcfromtimestamp``.
    """
    # Try to get a date from the check point first.
    stored = helper.get_check_point(check_point_key)
    if stored not in (None, ''):
        # If there was a check point date, return it.
        return _to_naive_utc(dateutil.parser.parse(stored["max_date"]))

    # No check point date, so see whether a start date was given as an argument.
    configured = helper.get_arg("start_date_time")
    if configured not in (None, ''):
        # A value such as "...Z" or "...+02:00" parses as timezone-aware;
        # normalise it so it can be compared with naive UTC values.
        return _to_naive_utc(dateutil.parser.parse(configured))

    # Nothing specified: default to 5 days ago. Use UTC (not local time) to
    # stay consistent with the end of the window, which is also UTC.
    return datetime.datetime.utcnow() - datetime.timedelta(days=5)
def get_last_epoch(helper, check_point_key):
    """Return the ``max_epoch`` value stored in the check point.

    Falls back to ``0`` when the check point stored under *check_point_key*
    is missing (``None``) or empty (``''``).
    """
    stored = helper.get_check_point(check_point_key)
    if stored is None or stored == '':
        return 0
    return stored["max_epoch"]
def _forward_message_traces(helper, ew, results, index_metadata, max_epoch, progress):
    """Write one page of MessageTrace results to Splunk.

    *results* is the list found under ``d.results`` in the service response.
    *max_epoch* is the epoch (milliseconds) stored by the previous run;
    entries received at or before it are skipped as duplicates.
    *progress* is a dict with ``max_date`` and ``max_epoch`` keys that is
    updated in place after each event has been written, so the caller can
    check-point whatever was really forwarded even if a later entry fails.
    """
    received_pattern = re.compile(r'/Date\((.+?)\)/')
    for message_trace in results:
        # According to https://msdn.microsoft.com/en-us/library/office/jj984335.aspx
        # the StartDate and EndDate fields carry no useful information in the
        # report results. Tolerate their absence instead of aborting the batch.
        message_trace.pop("StartDate", None)
        message_trace.pop("EndDate", None)
        if not index_metadata:
            message_trace.pop("__metadata", None)

        new_max_date = progress["max_date"]
        new_max_epoch = progress["max_epoch"]

        # Convert the /Date(<ms>)/ format returned in the JSON and expose it
        # as a new ISO 8601 field.
        match = received_pattern.search(message_trace.get("Received") or "")
        if match:
            epoch_ms = int(match.group(1))
            # There is a chance that we could ingest duplicate data due to
            # date granularity. This check should catch those situations.
            if epoch_ms <= max_epoch:
                continue
            received_at = datetime.datetime.utcfromtimestamp(epoch_ms / 1000)
            message_trace["DateReceived"] = received_at.isoformat() + "Z"
            # Work out the new maxima before writing so that a failure here
            # cannot leave an event written but not accounted for.
            new_max_date = max(new_max_date, received_at)
            new_max_epoch = max(new_max_epoch, epoch_ms)

        # NOTE(review): entries whose "Received" value does not match are
        # still forwarded, just without "DateReceived" - confirm this is the
        # intended nesting (the indentation of the pasted script was lost).
        event = helper.new_event(
            source=helper.get_input_type(),
            index=helper.get_output_index(),
            sourcetype=helper.get_sourcetype(),
            data=json.dumps(message_trace),
        )
        ew.write_event(event)
        progress["max_date"] = new_max_date
        progress["max_epoch"] = new_max_epoch


def collect_events(helper, ew):
    """Collect Office 365 message traces for the current polling window.

    Queries the MessageTrace report from the stored check point (or the
    configured / default start date) up to three minutes before now, follows
    every ``__next`` link the service returns, writes each new entry through
    *ew*, and finally stores the newest received date / epoch seen across
    all pages as the check point for the next run.

    Errors are logged through *helper* and never raised. When a request
    fails part-way, the check point still advances past the events that were
    already written so they are not ingested twice.
    """
    account = helper.get_arg("office_365_account")
    auth = requests.auth.HTTPBasicAuth(account["username"], account["password"])
    index_metadata = helper.get_arg("index_metadata")
    check_point_key = "%s_obj_checkpoint" % helper.get_input_stanza_names()
    start_date = get_start_date(helper, check_point_key)
    max_epoch = get_last_epoch(helper, check_point_key)

    # Sometimes the Subject and FromIP are null and Size is 0 from MS.
    # We are probably fetching the log while it has not synced properly,
    # which would put bad data into Splunk, so end the window 180s in the
    # past instead of at now().
    end_date = datetime.datetime.utcnow() - datetime.timedelta(seconds=180)

    # NOTE(review): OData normally spells this option "$orderby"; the
    # parameter name is left as it was - confirm the service honours it.
    url = "https://reports.office365.com/ecp/reportingwebservice/reporting.svc/MessageTrace?$format=json&orderby=Received asc&$filter=StartDate eq datetime'%sZ' and EndDate eq datetime'%sZ'" % (start_date.isoformat(), end_date.isoformat())

    progress = {"max_date": start_date, "max_epoch": max_epoch}
    page_processed = False
    try:
        while url:
            helper.log_debug("Endpoint URL: %s" % url)
            # The request is inside the try block so that connection errors
            # are logged rather than escaping.
            response = requests.get(url, auth=auth)
            response.raise_for_status()
            page = response.json()["d"]
            _forward_message_traces(helper, ew, page["results"], index_metadata, max_epoch, progress)
            page_processed = True

            next_link = page.get("__next", 0)
            helper.log_debug("Next URL: %s" % next_link)
            # Follow-up pages need the format / ordering options re-applied.
            url = next_link + "&$format=json&orderby=Received%20asc" if next_link else None
    except Exception as e:
        helper.log_error("HTTP Request error: %s" % str(e))

    # Save the check point when at least one page completed, or when events
    # were written before a failure. If the very first request failed, the
    # existing check point is left untouched.
    if page_processed or progress["max_epoch"] != max_epoch:
        checkpoint_data = {
            "max_date": str(progress["max_date"]),
            "max_epoch": progress["max_epoch"],
        }
        try:
            helper.save_check_point(check_point_key, checkpoint_data)
        except Exception as e:
            helper.log_error("Failed to save check point: %s" % str(e))
def collect_events_next(helper, ew, _next):
    """Fetch one follow-up page of message traces and write its events.

    *_next* is the ``__next`` link returned with the previous page; the
    JSON format and ordering options are appended to it before the request.

    Returns the ``__next`` link of the fetched page (``0`` when the page has
    none), or ``None`` when the request or processing failed. Either falsy
    value tells the caller to stop paging. Errors are logged, not raised.

    NOTE: only the next link is returned. The received timestamps seen on
    this page are not reported back, so a caller that saves a check point
    cannot account for the events written here.
    """
    account = helper.get_arg("office_365_account")
    index_metadata = helper.get_arg("index_metadata")
    check_point_key = "%s_obj_checkpoint" % helper.get_input_stanza_names()
    microsoft_trace_url = _next + "&$format=json&orderby=Received%20asc"
    helper.log_debug("Endpoint URL: %s" % microsoft_trace_url)
    try:
        # The request is inside the try block so that connection errors are
        # logged rather than escaping.
        r = requests.get(
            microsoft_trace_url,
            auth=requests.auth.HTTPBasicAuth(account["username"], account["password"]),
        )
        r.raise_for_status()
        data = r.json()
        max_epoch = get_last_epoch(helper, check_point_key)
        received_pattern = re.compile(r'/Date\((.+?)\)/')
        for message_trace in data["d"]["results"]:
            # According to https://msdn.microsoft.com/en-us/library/office/jj984335.aspx
            # the StartDate and EndDate fields carry no useful information in
            # the report results. Tolerate their absence.
            message_trace.pop("StartDate", None)
            message_trace.pop("EndDate", None)
            if not index_metadata:
                message_trace.pop("__metadata", None)

            # Convert the /Date(<ms>)/ format returned in the JSON and expose
            # it as a new ISO 8601 field.
            match = received_pattern.search(message_trace.get("Received") or "")
            if match:
                epoch_ms = int(match.group(1))
                # There is a chance that we could ingest duplicate data due
                # to date granularity. This check should catch those cases.
                if epoch_ms <= max_epoch:
                    continue
                received_at = datetime.datetime.utcfromtimestamp(epoch_ms / 1000)
                message_trace["DateReceived"] = received_at.isoformat() + "Z"

            # NOTE(review): entries whose "Received" value does not match are
            # still forwarded, just without "DateReceived" - confirm this is
            # the intended nesting (the pasted script lost its indentation).
            event = helper.new_event(
                source=helper.get_input_type(),
                index=helper.get_output_index(),
                sourcetype=helper.get_sourcetype(),
                data=json.dumps(message_trace),
            )
            ew.write_event(event)

        next_link = data["d"].get('__next', 0)
        helper.log_debug("Next URL: %s" % next_link)
        return next_link
    except Exception as e:
        helper.log_error("HTTP Request error: %s" % str(e))
        return None
... View more