I am trying to use a huggingface dataset across di...
# ask-metaflow
I am trying to use a Hugging Face dataset across different steps in a Metaflow run; however, it errors out. My guess is that subsequent steps after loading the dataset can't access the cache of the previous step. Is it possible to save an HF dataset as a Metaflow artifact? I am running this on AWS Batch. Sample code and error below:
Copy code
@step
    def load_data(self):
        from datasets import load_dataset

        with S3() as s3:
            s3obj = s3.get(self.org_data)
            dataset = load_dataset("csv",data_files=s3obj.path)
        self.dataset = dataset
        print(self.dataset)
        self.next(self.eda)

    @step
    def eda(self):
        """Exploratory step: display the dataset artifact produced upstream."""
        ds = self.dataset
        print(ds)
error:
Copy code
File "flow.py", line 133, in pre_nightingale_eda
    print(self.dataset)
  File "/flow/metaflow/metaflow/flowspec.py", line 224, in __getattr__
    x = self._datastore[name]
  File "/flow/metaflow/metaflow/datastore/task_datastore.py", line 45, in method
    return f(self, *args, **kwargs)
  File "/flow/metaflow/metaflow/datastore/task_datastore.py", line 836, in __getitem__
    _, obj = next(self.load_artifacts([name]))
  File "/flow/metaflow/metaflow/datastore/task_datastore.py", line 370, in load_artifacts
    yield name, pickle.loads(blob)
  File "/usr/local/lib/python3.8/site-packages/datasets/table.py", line 1069, in __setstate__
    table = _memory_mapped_arrow_table_from_file(path)
  File "/usr/local/lib/python3.8/site-packages/datasets/table.py", line 65, in _memory_mapped_arrow_table_from_file
    opened_stream = _memory_mapped_record_batch_reader_from_file(filename)
  File "/usr/local/lib/python3.8/site-packages/datasets/table.py", line 50, in _memory_mapped_record_batch_reader_from_file
    memory_mapped_stream = pa.memory_map(filename)
  File "pyarrow/io.pxi", line 1009, in pyarrow.lib.memory_map
  File "pyarrow/io.pxi", line 956, in pyarrow.lib.MemoryMappedFile._open
  File "pyarrow/error.pxi", line 144, in pyarrow.lib.pyarrow_internal_check_status
  File "pyarrow/error.pxi", line 113, in pyarrow.lib.check_status
FileNotFoundError: [Errno 2] Failed to open local file '/root/.cache/huggingface/datasets/csv/default-42f4d214a7c91375/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/csv-train-00000-of-00009.arrow'. Detail: [errno 2] No such file or directory
1