I ended up editing AzureStorageTable.py to make it work. I edited/added the following:
import md5, hashlib
.
.
def get_encoded_csv_file_path(checkpoint_dir, file_name):
    """Build a filesystem-safe ``.csv.gz`` path for *file_name* under *checkpoint_dir*.

    Non-alphanumeric characters are replaced with underscores and the
    sanitized name is truncated to 100 characters; an md5 hex digest of the
    original name is appended so distinct names that sanitize to the same
    string do not collide.
    """
    # join() over a generator avoids quadratic repeated string concatenation.
    safe_name = "".join(c if c.isalnum() else "_" for c in file_name)[:100]
    # hashlib replaces the removed md5 module (md5.new() is deprecated);
    # hashlib requires bytes, so encode explicitly for Python 3 compatibility.
    digest = hashlib.md5()
    digest.update(file_name.encode("utf-8"))
    return os.path.join(checkpoint_dir, safe_name + "_" + digest.hexdigest() + ".csv.gz")
def dateTimetoTicks(dt):
    """Convert a naive UTC datetime to .NET ticks (100-ns units since 0001-01-01).

    Azure Table Storage can filter on Int64 tick values, which is much faster
    than filtering on datetime objects.
    """
    # Ticks between 0001-01-01 and the Unix epoch (1970-01-01).
    ticksBeforeEpoch = 621355968000000000
    delta = dt - datetime.datetime.utcfromtimestamp(0)
    # Pure integer arithmetic: total_seconds() returns a float, which loses
    # sub-second precision once scaled to 100-ns resolution (~6e17 range).
    ticksSinceEpoch = ((delta.days * 86400 + delta.seconds) * 1000000
                       + delta.microseconds) * 10
    return ticksBeforeEpoch + ticksSinceEpoch
.
.
# Default the query start time to 2 days ago, expressed as .NET ticks.
dateTimeStart = dateTimetoTicks(datetime.datetime.today() - datetime.timedelta(days=2))
if date_time_start not in ('', None):
    # A start time was specified in the config; it overrides the default.
    dateTimeStart = dateTimetoTicks(dateutil.parser.parse(date_time_start))
.
.
# Resume from the continuation marker when one exists; it supersedes the
# configured/default start time.
if marker is not None:
    dateTimeStart = marker
# NEW filter_string with ticks as filter - appending L for specifying Int64
# in the OData query sent to Azure Table Storage.
filter_string = "%s gt %sL" % (date_time_column, dateTimeStart)
.
.
# Update to check for if entity is long.
# NOTE(review): `long` is Python-2-only; under Python 3 this would need `int`.
if not isinstance(entity[date_time_column], long):
    # if the entity column is not a datetime, try to convert it
    # (the matching except clause is outside this excerpt)
    try:
        entity[date_time_column] = dateTimetoTicks(entity[date_time_column])
.
.
# Track the newest tick value seen so it can serve as the next query marker.
if isinstance(entity[date_time_column], long) and (entity[date_time_column] > last_dateTime):
    # compare this entity's tick value to the last_dateTime variable
    last_dateTime = entity[date_time_column]
Querying on PartitionKey (ticks) is much faster than querying datetime objects, as explained here:
http://www.codeproject.com/Tips/671361/Some-tips-and-tricks-for-azure-table-storage
Cheers!
... View more