Hey all, I have a large dataframe that I want to save as JSONL on S3. Right now I save it locally first and then upload it to S3, which is time- and memory-intensive. Is it possible to save the df directly to S3?
sample code:
import os
from metaflow import S3, batch, step  # this step lives inside a Metaflow FlowSpec

@batch(**{'cpu': 4, 'memory': 60000})
@step
def genereate_result_df(self):
    # write both dataframes to local JSONL files first...
    outdir = '/flow/outdir'
    os.mkdir(outdir)
    df_true.to_json(f'{outdir}/processed.jsonl', orient='records', lines=True)
    df_error.to_json(f'{outdir}/error.jsonl', orient='records', lines=True)

    # ...then upload every file in the output dir to S3
    file_list = [f for f in os.listdir(outdir) if os.path.isfile(os.path.join(outdir, f))]
    key_path_list = [(f, os.path.join(outdir, f)) for f in file_list]
    with S3(s3root=self.result_folder) as s3up:
        s3up.put_files(key_path_list)
    self.next(self.end)
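
For reference, this is roughly what I'm hoping is possible: skip the local files and hand an in-memory JSONL string to the S3 client. This is just an untested sketch, assuming the Metaflow S3 client's put() accepts a plain string as the object body (df_true and self.result_folder are from my code above):

from metaflow import S3

# untested sketch: to_json() returns a string when no path is given,
# so serialize the dataframe to JSONL in memory and push it straight
# to S3 with put(), skipping the local file step entirely
jsonl_str = df_true.to_json(orient='records', lines=True)
with S3(s3root=self.result_folder) as s3up:
    s3up.put('processed.jsonl', jsonl_str)

Though I guess this still builds the whole JSONL string in memory, so it may not help with the memory part. I've also seen that pandas can write straight to an s3:// path when s3fs is installed, but I'm not sure how well that plays with the Batch environment.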