# ask-metaflow
Hey all, I have a large dataframe that I want to save as JSONL on S3. Right now I save it locally first and then upload it, which is time- and memory-intensive. Is it possible to save the df directly to S3? Sample code:

```python
@batch(**{'cpu': 4, 'memory': 60000})
@step
def genereate_result_df(self):
    outdir = '/flow/outdir'
    os.mkdir(outdir)

    # Write the dataframes to local JSONL files first
    df_true.to_json(f'{outdir}/processed.jsonl', orient='records', lines=True)
    df_error.to_json(f'{outdir}/error.jsonl', orient='records', lines=True)

    # Collect (key, path) pairs and upload them to S3
    file_list = [f for f in os.listdir(outdir) if os.path.isfile(os.path.join(outdir, f))]
    key_path_list = [(f, os.path.join(outdir, f)) for f in file_list]
    with S3(s3root=self.result_folder) as s3up:
        s3up.put_files(key_path_list)

    self.next(self.end)
```
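
One possible approach, as a minimal sketch rather than a confirmed answer: `to_json()` returns a string when no path is given, and Metaflow's `S3.put()` accepts a string or bytes object, so the local files can be skipped entirely. This assumes the serialized JSONL fits in memory and that `df_true` / `df_error` are available on `self` (the attribute names here are illustrative).

```python
from metaflow import batch, step, S3

# Inside the FlowSpec class:
@batch(**{'cpu': 4, 'memory': 60000})
@step
def genereate_result_df(self):
    # Serialize to in-memory JSONL strings instead of local files.
    processed = self.df_true.to_json(orient='records', lines=True)
    errors = self.df_error.to_json(orient='records', lines=True)

    # Upload the strings straight to S3 with Metaflow's S3 client.
    with S3(s3root=self.result_folder) as s3up:
        s3up.put('processed.jsonl', processed)
        s3up.put('error.jsonl', errors)

    self.next(self.end)
```

Another option, if `s3fs` is installed, is to pass an `s3://...` URI directly to `to_json()` and let pandas stream to S3 itself; either way the intermediate local copy goes away.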