# python_sample.py
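
# The helper get_s3_client() used by retrieve_objects() below is not part of
# this snippet; the definition here is only a minimal sketch (an assumption,
# not the original implementation): a boto3 S3 client built from the default
# credential chain. The real python_sample.py may configure region, endpoint,
# or credentials explicitly.
import boto3


def get_s3_client():
    # Assumption: default AWS credential/region resolution is sufficient.
    return boto3.client('s3')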


import gzip
from io import BytesIO, StringIO

import botocore.exceptions
import pandas as pd


def retrieve_objects(bucket, keys, start_time, end_time):
    # Convert the epoch-second bounds to pandas timestamps for the comparisons below.
    start_time = pd.to_datetime(start_time, unit='s')
    end_time = pd.to_datetime(end_time, unit='s')
    client = get_s3_client()
    # Clear the content of the output file before appending filtered batches.
    with open('output.csv', 'w') as f_output:
        pass
    with open('output.csv', 'a') as f_output:
        for key in keys:
            try:
                response = client.get_object(Bucket=bucket, Key=key)
                data = StringIO(gzip.open(BytesIO(response['Body'].read()), 'rt').read())
                df = pd.read_json(data, lines=True)
                # Skip objects that end before the requested window; stop once an
                # object starts after it (keys are assumed to be time-ordered).
                if df['Timestamp'].iloc[-1] < start_time:
                    continue
                elif df['Timestamp'].iloc[0] > end_time:
                    break
                # Keep trade events only.
                df = df[df['Type'] == "OrderTradeReportEvent"]
                df = df[(df['Timestamp'] >= start_time) & (df['Timestamp'] <= end_time)]
                # Additional processing may be applied here using pandas,
                # e.g. drop columns or change the date format:
                #   df['Timestamp'] = pd.to_datetime(df['Timestamp'])
                #   df['Timestamp'] = df['Timestamp'].dt.strftime('%d-%m-%Y')
                if not df.empty:
                    # Write the CSV header only on the first write to the file.
                    df.to_csv(f_output, header=f_output.tell() == 0, index=False)
            except botocore.exceptions.ClientError as e:
                print(f"Failed to read data from {key}: {e}")
            except KeyError as e:
                print(f"Failed to extract timestamp from {key}: {e}")
            except ValueError:
                print(f"Failed to parse JSON from {key}")
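

# Illustrative usage only -- the bucket name, object keys, and epoch-second
# timestamps below are hypothetical placeholders, not values from the source;
# replace them before running.
if __name__ == '__main__':
    retrieve_objects(
        bucket='example-order-log-bucket',       # placeholder bucket name
        keys=['order-logs/2024-01-02.json.gz',   # placeholder, time-ordered keys
              'order-logs/2024-01-03.json.gz'],
        start_time=1704153600,                   # 2024-01-02 00:00:00 UTC
        end_time=1704240000,                     # 2024-01-03 00:00:00 UTC
    )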