I wrote a Python script that works great from the command line; however, when I run it from a search in the browser, I don't get any results:
#!/usr/bin/env python
import csv
import sys
import email
""" An MIMEDecoder that takes CSV as input, performs a email.Header.decode_header
on the field, then returns the decoded text in CSV results
"""
# Cursor into the string currently being scanned by decode_between().
# Callers reset it to 0 before starting on a new string.
idx = 0


def _payload_start(s, start, first):
    """Return the offset just past a leading "charset?encoding?" prefix.

    Falls back to the offset right after `first` when no such prefix follows.
    """
    import re

    begin = start + len(first)
    prefix = re.match(r'[^?\s]*\?[BbQq]\?', s[begin:])
    return begin + prefix.end() if prefix else begin


def _decode_fragment(fragment):
    """Run decode_header over one fragment and join the resulting parts."""
    from email.header import decode_header

    pieces = []
    for text, charset in decode_header(fragment):
        # On Python 2 bytes is str, so parts are passed through untouched
        # (the original behaviour).  On Python 3 encoded parts come back as
        # bytes and must be turned into text before they can be joined.
        if isinstance(text, bytes) and not isinstance(text, str):
            try:
                text = text.decode(charset or 'ascii', 'replace')
            except LookupError:
                # Unknown charset name: fall back to ASCII.
                text = text.decode('ascii', 'replace')
        pieces.append(text)
    return ''.join(pieces).strip()


def decode_between(s, first, last):
    """Return the next chunk of `s`, decoding it if it is an encoded word.

    Scanning starts at the module-level cursor `idx`, which is advanced past
    the chunk that is returned; call repeatedly until `idx` reaches len(s).

    * Text before the next `first` marker is returned unchanged.
    * A `first` ... `last` span is decoded with email.header.decode_header.
    * A span with no closing `last` is treated as truncated: `last` is
      appended and the remainder of the string is decoded.
    * When no `first` marker remains, the rest of the string is returned.

    Returns "" once the cursor is at or past the end of `s`.
    """
    global idx
    while idx < len(s):
        try:
            start = s.index(first, idx)
            if start > idx:
                literal = s[idx:start]
                idx = start
                return literal
            # Look for the terminator only after the "charset?encoding?"
            # prefix; otherwise a Q-encoded payload that begins with "=XX"
            # makes "?Q?=" look like the end of the word.
            end = s.index(last, _payload_start(s, start, first)) + len(last)
            idx = end
            return _decode_fragment(s[start:end])
        except ValueError:
            # If the encoded string is only a partial string try to decode.
            try:
                start = s.index(first, idx)
                idx = len(s)
                return _decode_fragment(s[start:] + last)
            except ValueError:
                if idx < len(s):
                    tail = s[idx:]
                    idx = len(s)
                    return tail
    return ""
def main():
    """Decode a MIME-encoded CSV column read from stdin and write CSV to stdout.

    Expects exactly two command-line arguments: the name of the column that
    holds the encoded text and the name of the column that receives the
    decoded text.  Rows that already carry both values are passed through;
    rows with only the encoded value are decoded chunk by chunk with
    decode_between().
    """
    global idx
    if len(sys.argv) != 3:
        print("Usage: python MIMEDecoder.py [MIME Encoded field] [MIME Decoded field]")
        sys.exit(1)
    MIMEEncode = sys.argv[1]
    MIMEDecode = sys.argv[2]
    r = csv.DictReader(sys.stdin)
    w = csv.DictWriter(sys.stdout, fieldnames=r.fieldnames)
    w.writeheader()
    for result in r:
        # decode_between() keeps its position in the module-level cursor,
        # so it has to be rewound for every row.
        idx = 0
        if result[MIMEEncode] and result[MIMEDecode]:
            # both fields were provided, just pass it along
            w.writerow(result)
        elif result[MIMEEncode]:
            # only the encoded field was provided, build the decoded field
            encoded = result[MIMEEncode]
            pieces = [result[MIMEDecode] or '']
            while idx < len(encoded):
                pieces.append(decode_between(encoded, "=?", "?="))
            decoded = ''.join(pieces).replace('??', '')
            result[MIMEDecode] = decoded
            # A row that yields nothing is skipped.  Exiting here, as the
            # script used to, silently dropped every remaining row.
            if decoded:
                w.writerow(result)


if __name__ == '__main__':
    main()
Here is the CSV file I used for input:
encodeTXT,decodeTXT
=?utf-8?Q?RE:=20Most=20Efficient=20Way=20to=20Write=20Values=3F?=,
=?UTF-8?B?KEJOKSBVLlMuIFNtYWxsLUNhcCBTaGFyZXMgUmFsbHkgV2hpbGUgVHJlYXN1cmllcyBSZQ==?=??=?UTF-8?B?dHJlYXQsIE9pbCBH?=,
=?iso-8859-1?B?U2VlIGhvdyBQYW1wZXJzIGFuZCBXYWxtYXJ0IGFyZSBoZWxwaW5nIGxvY2FsIGNoaWxkcmVu?=,
From the command-line this is what I run and the results as expected:
[splunk@splunk bin]$ $SPLUNK_HOME/bin/splunk cmd python $SPLUNK_HOME/etc/system/bin/mime_decoder.py encodeTXT decodeTXT </tmp/input.csv
encodeTXT,decodeTXT
=?utf-8?Q?RE:=20Most=20Efficient=20Way=20to=20Write=20Values=3F?=,RE: Most Efficient Way to Write Values?
=?UTF-8?B?KEJOKSBVLlMuIFNtYWxsLUNhcCBTaGFyZXMgUmFsbHkgV2hpbGUgVHJlYXN1cmllcyBSZQ==?=??=?UTF-8?B?dHJlYXQsIE9pbCBH?=,"(BN) U.S. Small-Cap Shares Rally While Treasuries Retreat, Oil G"
=?iso-8859-1?B?U2VlIGhvdyBQYW1wZXJzIGFuZCBXYWxtYXJ0IGFyZSBoZWxwaW5nIGxvY2FsIGNoaWxkcmVu?=,See how Pampers and Walmart are helping local children
However I get no results when I run it through the gui as so:
|lookup mimedecoder encodeTXT as subject_new|table subject_new, decodeTXT
I have added the correct lines to my transforms.conf, and the lookup works if I use another field such as from_address. Note: my script returns the original field if it isn't encoded.
So what am I doing wrong? Does it not like the equals sign and question mark characters, or am I doing something else wrong?
Thank you,
Brian
OK, I have made my script even more complicated, and now it displays most results correctly. I have only run across two issues: it doesn't like spaces at the beginning or end of the Splunk field I pass in to decode, and iso-2022-jp doesn't decode correctly. If anyone has suggestions to improve my code, that would be great, but other than those two minor issues it works great:
#!/usr/bin/env python
import csv
import sys
import email
from email.header import Header, decode_header
""" An MIMEDecoder that takes CSV as input, performs a email.Header.decode_header
on the field, then returns the decoded text in CSV results
"""
def getmailheader(header_text, default="ascii"):
    """Decode one RFC 2047 header fragment into text.

    header_text -- the raw header value, possibly containing encoded words.
    default     -- charset used when a part names no charset or an unknown one.

    If the fragment cannot be parsed and is longer than 10 characters, the
    last 3, 4 and then 5 characters are replaced by "?=" and parsing is
    retried.  When every attempt fails (or the fragment is too short to
    trim) the marker string "***CORRUPTED***" is returned, so callers always
    receive a string.

    Note: This function works by itself, but if there are multiple strings
    that need decoding you get encoding in the middle of the results.
    """
    from email.errors import HeaderParseError
    from email.header import decode_header

    try:
        parts = decode_header(header_text)
    except HeaderParseError:
        parts = None
        if len(header_text) > 10:
            # If the string doesn't decode correctly try stripping a few
            # end characters.
            for trim in (3, 4, 5):
                candidate = header_text[:len(header_text) - trim] + '?='
                try:
                    parts = decode_header(candidate)
                except HeaderParseError:
                    continue
                break
        if parts is None:
            return "***CORRUPTED***"

    decoded = []
    for text, charset in parts:
        # Byte strings (always on Python 2, encoded parts on Python 3) are
        # converted to text; parts that are already text are kept as is.
        if isinstance(text, bytes):
            try:
                text = text.decode(charset or default, 'replace')
            except LookupError:
                # if the charset is unknown, force default
                text = text.decode(default, 'replace')
        decoded.append(text)
    return u"".join(decoded)
def decode_subject(subject):
    """Decode every encoded word in `subject` and return the merged text.

    The subject is split into literal text and "=?...?=" segments; literal
    text is copied unchanged and each segment is decoded on its own with
    getmailheader().  A segment with no closing "?=" is treated as
    truncated: "?=" is appended and the rest of the subject is decoded.
    """
    import re

    # "=?charset?B?" / "=?charset?Q?" prefix of an encoded word.  Matching it
    # at the word start keeps the terminator search inside this word, even
    # when a Q-encoded payload begins with "=XX".
    word_prefix = re.compile(r'=\?[^?]*\?[BbQq]\?')

    pieces = []
    pointer = 0
    length = len(subject)
    while pointer < length:
        beginning = subject.find('=?', pointer)
        if beginning == -1:
            # Found no beginning string, keep the rest of the field as is.
            pieces.append(subject[pointer:])
            break
        if beginning > pointer:
            # Literal text in front of the encoded word.
            pieces.append(subject[pointer:beginning])
        prefix = word_prefix.match(subject, beginning)
        payload = prefix.end() if prefix else beginning + 2
        ending = subject.find('?=', payload)
        if ending == -1:
            # Found no end string: add one and decode the rest of the field.
            pieces.append(getmailheader(subject[beginning:] + '?=') or '')
            break
        pieces.append(getmailheader(subject[beginning:ending + 2]) or '')
        pointer = ending + 2
    return ''.join(pieces)
def main():
    """Decode a MIME-encoded CSV column read from stdin and write CSV to stdout.

    Expects exactly two command-line arguments: the name of the column that
    holds the encoded text and the name of the column that receives the
    decoded text.  Rows with an empty encoded column, or with an empty
    decoding result, are not written.
    """
    if len(sys.argv) != 3:
        print("Usage: python MIMEDecoder.py [MIME Encoded field] [MIME Decoded field]")
        sys.exit(1)
    MIMEEncode = sys.argv[1]
    MIMEDecode = sys.argv[2]
    r = csv.DictReader(sys.stdin)
    w = csv.DictWriter(sys.stdout, fieldnames=r.fieldnames)
    w.writeheader()
    for result in r:
        encoded = result[MIMEEncode]
        if not encoded:
            continue
        if result[MIMEDecode]:
            # both fields were provided, just pass it along
            w.writerow(result)
            continue
        if "=?" not in encoded:
            # The field does not appear to contain encoded data.
            decoded = encoded
        else:
            # Remove extra characters that are not part of the encoding,
            # then decode the field.
            decoded = decode_subject(encoded.replace('??', '').replace('? ', ''))
        if not isinstance(decoded, str):
            # Python 2: the csv module cannot write unicode objects that
            # contain non-ASCII characters, so hand it UTF-8 bytes.
            decoded = decoded.encode('utf-8')
        result[MIMEDecode] = decoded
        if decoded:
            # If successfully decoded then write results
            w.writerow(result)


if __name__ == '__main__':
    main()
For Python 3, we can use the email module's policy.default to make decoding easier, even if it doesn't include attempts to fix malformed MIME data. I also used argparse, so that test input and output can be used without relying on piping through stdin/stdout.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import csv
import sys
from email import message_from_string
from email import policy
def decode_mime(encoded_field):
    """Return `encoded_field` with RFC 2047 encoded words decoded.

    The value is parsed as the body of a dummy "MIMEDecode" header using
    email.policy.default.  The original value is returned unchanged when it
    contains no "=?" marker, when parsing raises, or when the parser yields
    no header value.  The result is always a plain str.
    """
    # Return original if it does not contain encoded data
    if "=?" not in encoded_field:
        return encoded_field
    try:
        message = message_from_string(
            'MIMEDecode: {}'.format(encoded_field),
            policy=policy.default,
        )
        decoded = message.get('MIMEDecode')
    except Exception:
        # Best effort: malformed input falls back to the raw value.  Unlike
        # a bare except, this lets KeyboardInterrupt/SystemExit propagate.
        return encoded_field
    if decoded is None:
        return encoded_field
    # The policy returns a str subclass; hand callers a plain str.
    return str(decoded)
def process_line(input_dict, input_field, output_field):
    """Fill in the decoded column of one CSV row, in place.

    Nothing happens when the input column is empty or the output column
    already holds a value; otherwise the output column receives
    decode_mime() of the input column.
    """
    source_value = input_dict[input_field]
    if not source_value:
        return
    if input_dict[output_field]:
        return
    input_dict[output_field] = decode_mime(source_value)
def get_csv_writer(infile, outfile, *args):
    """Build a CSV reader/writer pair that share the input's header row.

    infile  -- text stream containing CSV with a header row.
    outfile -- text stream the header row is written to immediately.
    *args   -- column names that must be present in the header.

    Returns a (csv.DictReader, csv.DictWriter) tuple.
    Raises KeyError if any name in `args` is missing from the header; this
    includes the case where the input is empty and has no header at all.
    """
    reader = csv.DictReader(infile)
    # fieldnames is None for empty input; treat that as "no columns" so the
    # check below reports the missing column instead of raising TypeError.
    header = reader.fieldnames or []
    for arg in args:
        if arg not in header:
            raise KeyError(f'{arg!r} from command line arguments not found in input CSV headers')
    writer = csv.DictWriter(outfile, header)
    writer.writeheader()
    return reader, writer
def main():
    """Command-line entry point: decode one CSV column into another.

    Takes the encoded and decoded column names as positional arguments and
    optional -i/--infile and -o/--outfile paths (stdin/stdout by default).
    Every input row is passed through process_line() and written out.
    """
    parser = argparse.ArgumentParser(description='Decode RFC 2047 MIME encoded data from a comma separated input.')
    parser.add_argument(
        'encoded', type=str, nargs=1,
        help='CSV field name for the MIME encoded field. '
             'Input can contain strings like "=?US-ASCII*EN?Q?Keith_Moore?="')
    parser.add_argument(
        'decoded', type=str, nargs=1,
        help='CSV field name of the MIME field for decoded output. '
             'Output will contain the decoded value like "Keith Moore" or the original value provided.')
    parser.add_argument('-i', '--infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin,
                        help='Input CSV, defaults to stdin')
    parser.add_argument('-o', '--outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout,
                        help='Output CSV, defaults to stdout')
    args = parser.parse_args()

    # nargs=1 makes argparse return one-element lists for the positionals.
    arg_list = [
        args.encoded[0],
        args.decoded[0],
    ]
    reader, writer = get_csv_writer(args.infile, args.outfile, *arg_list)
    for line in reader:
        process_line(line, *arg_list)
        writer.writerow(line)


if __name__ == '__main__':
    main()
The shell of this function can be used for other Python external lookups in Splunk. The following is an example that doesn't do anything useful.
https://gist.github.com/malvidin/572a006bc808a009e6cd845915625849
The Cisco ESA logs that include the raw subject also include escaped CRLF and Tab characters from the email header folding, like "This subject was folded across\r\n multiple lines".
Adding this to the decode_mime function above will clean ESA subjects before decoding the subject, without removing escaped characters that will appear as "This subject was \\r\\n not folded".
# Clean escaped header folding out of a Cisco ESA subject before decoding.
# The field holds the two-character escapes backslash-r, backslash-n and
# backslash-t rather than real control characters.  The negative lookbehind
# (?<!\\) leaves sequences that are themselves escaped with an extra
# backslash untouched.  Requires "import re" at the top of the file.
if r"\t" in encoded_field or r"\r\n" in encoded_field:
    encoded_field = re.sub(r"(?<!\\)\\r\\n( |\\t)+", " ", encoded_field)  # Unfold escaped CRLF
    encoded_field = re.sub(r"(?<!\\)\\r\\n$", "", encoded_field)  # Trim trailing CRLF
    encoded_field = re.sub(r"(?<!\\)\\t", "\t", encoded_field)  # Unescape tab characters
This is a working transforms.conf for the Python3 script, which works with Splunk 8.0
[decode_mime]
python.version = python3
allow_caching = 1
case_sensitive_match = 1
external_cmd = decode_mime.py mime_encoded mime_decoded
fields_list = mime_encoded,mime_decoded
Unfortunately, the email.header modules in Python 2.7.13 and Python 3.6 both fail to decode in a way that matches RFC 2047. Python 3.6 is more accurate, but Splunk currently uses Python 2.7.13.
Edit: Added an entry to skip decoding for potential ISO-2022 strings. The errors when ISO-2022 strings are attempted to be decoded can impact other events that do not use ISO-2022 encoding.
Related Issue:
https://answers.splunk.com/answers/500264/index.html
Machine Translation of Answer:
The iso-2022 family of character sets, including iso-2022-jp, is not supported, so data in those encodings cannot be indexed.
An enhancement request for a future release has been filed as SPL-136289, but whether it will be implemented is undecided.
I recommend using a header_rfc2047.py script based on header.py, with the following changes:
ecre
1.a. Eat whitespace between encoded strings "?=" and "=?"
Note: only checks for the leading "=?", not the next full encoded string
(?:[ \t\r\n]+(?==\?))? # eat whitespace between encodings
decode_header
2.a. Strip leading space characters
`for line in header.splitlines():
line = line.lstrip()
while parts:
2.b. Don't strip spaces from each popped part
unenc = parts.pop(0)`
__unicode__
3.a. Don't add a space character between strings, as appropriate spaces are now kept in the initial decoding
uchunks.append(UEMPTYSTRING)
_encode_chunks
4.a. When appending chunks, only add a space between them if both chunks are encoded.
if chunks and chunks[-1].endswith('?=') and charset != 'us-ascii':
extra = ' '
else:
extra = ''
With these changes, the following script should decode appropriately, but without attempts to fix broken MIME encoded strings.
#!/usr/bin/env python
import csv
import sys
import email
from header_rfc2047 import Header, decode_header, make_header
""" An MIMEDecoder that takes CSV as input, performs decode_header
and make_header on the field, then returns the decoded Unicode
text in CSV results
"""
def main():
    """Decode a MIME-encoded CSV column read from stdin and write CSV to stdout.

    Expects exactly two command-line arguments: the name of the column that
    holds the encoded text and the name of the column that receives the
    decoded text.  Values without "=?" and values that mention an ISO-2022
    charset are copied through undecoded; everything else goes through
    decode_header() and make_header().  Rows with an empty encoded column,
    or with an empty result, are not written.
    """
    if len(sys.argv) != 3:
        print("Usage: python MIMEDecoder.py [Encoded field] [Decoded field]")
        sys.exit(1)
    MIMEEncode = sys.argv[1]
    MIMEDecode = sys.argv[2]
    r = csv.DictReader(sys.stdin)
    w = csv.DictWriter(sys.stdout, fieldnames=r.fieldnames)
    w.writeheader()
    for result in r:
        encoded = result[MIMEEncode]
        if not encoded:
            continue
        if result[MIMEDecode]:
            # both fields were provided, just pass it along
            w.writerow(result)
            continue
        if "=?" not in encoded:
            # return original if it does not contain encoded data
            decoded = encoded
        elif "=?iso-2022" in encoded.lower():
            # if an ISO-2022 character set is detected, don't try
            # (remove when Splunk plays nicely with ISO-2022)
            decoded = encoded
        else:
            # Decode the field and convert to unicode
            decoded = make_header(decode_header(encoded)).__unicode__()
            if not isinstance(decoded, str):
                # Python 2: the csv module cannot write unicode objects
                # that contain non-ASCII characters, so hand it UTF-8 bytes.
                decoded = decoded.encode('utf-8')
        result[MIMEDecode] = decoded
        if decoded:
            # If successfully decoded then write results
            w.writerow(result)


if __name__ == '__main__':
    main()
Splunk doesn't like leading/trailing spaces in the data passed to external lookups. Until a fix is available, there are two workarounds. Add a comma to start of the string and remove it afterwards, or trim the field before sending to the lookup.
<initial search>
| eval subject="," . subject
| lookup mime_decode encoded AS subject OUTPUT decoded AS subject_decoded
| eval subject_decoded = substr(subject_decoded,2)
I don't know the source of the '? ' or '??' in bkirk's data, but I haven't seen it in mine. Most of the decoding errors I saw initially were caused by inconsistencies between Python's email.header module and RFC 2047.
OK, I have made my script even more complicated, and now it displays most results correctly. I have only run across two issues: it doesn't like spaces at the beginning or end of the Splunk field I pass in to decode, and iso-2022-jp doesn't decode correctly. If anyone has suggestions to improve my code, that would be great, but other than those two minor issues it works great:
#!/usr/bin/env python
import csv
import sys
import email
from email.header import Header, decode_header
""" An MIMEDecoder that takes CSV as input, performs a email.Header.decode_header
on the field, then returns the decoded text in CSV results
"""
def getmailheader(header_text, default="ascii"):
    """Decode one RFC 2047 header fragment into text.

    header_text -- the raw header value, possibly containing encoded words.
    default     -- charset used when a part names no charset or an unknown one.

    If the fragment cannot be parsed and is longer than 10 characters, the
    last 3, 4 and then 5 characters are replaced by "?=" and parsing is
    retried.  When every attempt fails (or the fragment is too short to
    trim) the marker string "***CORRUPTED***" is returned, so callers always
    receive a string.

    Note: This function works by itself, but if there are multiple strings
    that need decoding you get encoding in the middle of the results.
    """
    from email.errors import HeaderParseError
    from email.header import decode_header

    try:
        parts = decode_header(header_text)
    except HeaderParseError:
        parts = None
        if len(header_text) > 10:
            # If the string doesn't decode correctly try stripping a few
            # end characters.
            for trim in (3, 4, 5):
                candidate = header_text[:len(header_text) - trim] + '?='
                try:
                    parts = decode_header(candidate)
                except HeaderParseError:
                    continue
                break
        if parts is None:
            return "***CORRUPTED***"

    decoded = []
    for text, charset in parts:
        # Byte strings (always on Python 2, encoded parts on Python 3) are
        # converted to text; parts that are already text are kept as is.
        if isinstance(text, bytes):
            try:
                text = text.decode(charset or default, 'replace')
            except LookupError:
                # if the charset is unknown, force default
                text = text.decode(default, 'replace')
        decoded.append(text)
    return u"".join(decoded)
def decode_subject(subject):
    """Decode every encoded word in `subject` and return the merged text.

    The subject is split into literal text and "=?...?=" segments; literal
    text is copied unchanged and each segment is decoded on its own with
    getmailheader().  A segment with no closing "?=" is treated as
    truncated: "?=" is appended and the rest of the subject is decoded.
    """
    import re

    # "=?charset?B?" / "=?charset?Q?" prefix of an encoded word.  Matching it
    # at the word start keeps the terminator search inside this word, even
    # when a Q-encoded payload begins with "=XX".
    word_prefix = re.compile(r'=\?[^?]*\?[BbQq]\?')

    pieces = []
    pointer = 0
    length = len(subject)
    while pointer < length:
        beginning = subject.find('=?', pointer)
        if beginning == -1:
            # Found no beginning string, keep the rest of the field as is.
            pieces.append(subject[pointer:])
            break
        if beginning > pointer:
            # Literal text in front of the encoded word.
            pieces.append(subject[pointer:beginning])
        prefix = word_prefix.match(subject, beginning)
        payload = prefix.end() if prefix else beginning + 2
        ending = subject.find('?=', payload)
        if ending == -1:
            # Found no end string: add one and decode the rest of the field.
            pieces.append(getmailheader(subject[beginning:] + '?=') or '')
            break
        pieces.append(getmailheader(subject[beginning:ending + 2]) or '')
        pointer = ending + 2
    return ''.join(pieces)
def main():
    """Decode a MIME-encoded CSV column read from stdin and write CSV to stdout.

    Expects exactly two command-line arguments: the name of the column that
    holds the encoded text and the name of the column that receives the
    decoded text.  Rows with an empty encoded column, or with an empty
    decoding result, are not written.
    """
    if len(sys.argv) != 3:
        print("Usage: python MIMEDecoder.py [MIME Encoded field] [MIME Decoded field]")
        sys.exit(1)
    MIMEEncode = sys.argv[1]
    MIMEDecode = sys.argv[2]
    r = csv.DictReader(sys.stdin)
    w = csv.DictWriter(sys.stdout, fieldnames=r.fieldnames)
    w.writeheader()
    for result in r:
        encoded = result[MIMEEncode]
        if not encoded:
            continue
        if result[MIMEDecode]:
            # both fields were provided, just pass it along
            w.writerow(result)
            continue
        if "=?" not in encoded:
            # The field does not appear to contain encoded data.
            decoded = encoded
        else:
            # Remove extra characters that are not part of the encoding,
            # then decode the field.
            decoded = decode_subject(encoded.replace('??', '').replace('? ', ''))
        if not isinstance(decoded, str):
            # Python 2: the csv module cannot write unicode objects that
            # contain non-ASCII characters, so hand it UTF-8 bytes.
            decoded = decoded.encode('utf-8')
        result[MIMEDecode] = decoded
        if decoded:
            # If successfully decoded then write results
            w.writerow(result)


if __name__ == '__main__':
    main()
Hi Brian,
I turned your script into a TA add-on, implemented as a Splunk custom command compatible with the v2 protocol. You are credited as an author too.
MIME Decoder Add-on
https://splunkbase.splunk.com/app/5116/
Tomas
Well, here we go again: I found the answer to my own question. The problem probably couldn't have been figured out from the information I provided. My subject was extracted using a rex, and the values that contained encoded content also had a leading space that broke my Python script.
Hope that my mistake helps someone else out sometime.
Thank you,
Brian
Got the same problem with the subject of events ingested from a Fortimail appliance. This script worked perfectly.
Thanks 🙂